mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-01 15:32:26 +01:00
Added htmlz and pdf formats.
Added HTML reader/writer and PDF reader.
This commit is contained in:
@@ -53,6 +53,8 @@ Currently, I've tested following input formats:
|
||||
- rtf
|
||||
- mobi
|
||||
- fb2
|
||||
- html
|
||||
- pdf
|
||||
|
||||
Note that the old Microsoft doc format is not supported, although old documents
|
||||
can be fairly easily converted using text processing programs, like Word or
|
||||
@@ -68,6 +70,7 @@ Currently, following formats are supported:
|
||||
- epub
|
||||
- mobi
|
||||
- docx
|
||||
- htmlz (zipped HTML file with additional assets, like images)
|
||||
|
||||
|
||||
Installation
|
||||
|
||||
@@ -41,8 +41,7 @@ class PDFInput(InputFormatPlugin):
|
||||
PDFDocument(xml, self.opts, self.log)
|
||||
return os.path.join(getcwd(), 'metadata.opf')
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
accelerators):
|
||||
def convert(self, stream, options, file_ext, log, accelerators):
|
||||
from ebook_converter.ebooks.metadata.opf2 import OPFCreator
|
||||
from ebook_converter.ebooks.pdf.pdftohtml import pdftohtml
|
||||
|
||||
|
||||
@@ -471,7 +471,7 @@ class HTMLPreProcessor(object):
|
||||
return re.search('<H2[^><]*id=BookTitle', raw) is not None
|
||||
|
||||
def is_pdftohtml(self, src):
|
||||
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
|
||||
return '<!-- created by ebook-converter\'s pdftohtml -->' in src[:1000]
|
||||
|
||||
def __call__(self, html, remove_special_chars=None,
|
||||
get_preprocess_html=False):
|
||||
@@ -627,7 +627,7 @@ class HTMLPreProcessor(object):
|
||||
html = preprocessor(html)
|
||||
|
||||
if is_pdftohtml:
|
||||
html = html.replace('<!-- created by calibre\'s pdftohtml -->', '')
|
||||
html = html.replace('<!-- created by ebook-converter\'s pdftohtml -->', '')
|
||||
|
||||
if getattr(self.extra_opts, 'smarten_punctuation', False):
|
||||
html = smarten_punctuation(html, self.log)
|
||||
|
||||
@@ -43,7 +43,7 @@ class HeuristicProcessor(object):
|
||||
self.common_in_text_beginnings = '[\\w\'\"“‘‛]'
|
||||
|
||||
def is_pdftohtml(self, src):
|
||||
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
|
||||
return '<!-- created by ebook-converter\'s pdftohtml -->' in src[:1000]
|
||||
|
||||
def is_abbyy(self, src):
|
||||
return '<meta name="generator" content="ABBYY FineReader' in src[:1000]
|
||||
|
||||
@@ -15,7 +15,7 @@ from ebook_converter.ebooks import parse_css_length
|
||||
from ebook_converter.ebooks.docx.writer.utils import convert_color, int_or_zero
|
||||
from ebook_converter.utils.localization import lang_as_iso639_1
|
||||
from ebook_converter.polyglot.builtins import iteritems, filter, unicode_type
|
||||
from tinycss.css21 import CSS21Parser
|
||||
from ebook_converter.tinycss.css21 import CSS21Parser
|
||||
|
||||
css_parser = CSS21Parser()
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
from tinycss.color3 import parse_color_string
|
||||
from ebook_converter.tinycss.color3 import parse_color_string
|
||||
|
||||
|
||||
def int_or_zero(raw):
|
||||
|
||||
@@ -7,17 +7,17 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
import os, subprocess, shutil, re
|
||||
from functools import partial
|
||||
|
||||
from calibre import prints
|
||||
from calibre.constants import iswindows, ispy3
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from calibre.ebooks.metadata import (
|
||||
from ebook_converter import prints
|
||||
from ebook_converter.constants import iswindows, ispy3
|
||||
from ebook_converter.ptempfile import TemporaryDirectory
|
||||
from ebook_converter.ebooks.metadata import (
|
||||
MetaInformation, string_to_authors, check_isbn, check_doi)
|
||||
from calibre.utils.ipc.simple_worker import fork_job, WorkerError
|
||||
from polyglot.builtins import iteritems, unicode_type
|
||||
from ebook_converter.utils.ipc.simple_worker import fork_job, WorkerError
|
||||
from ebook_converter.polyglot.builtins import iteritems, unicode_type
|
||||
|
||||
|
||||
def get_tools():
|
||||
from calibre.ebooks.pdf.pdftohtml import PDFTOHTML
|
||||
from ebook_converter.ebooks.pdf.pdftohtml import PDFTOHTML
|
||||
base = os.path.dirname(PDFTOHTML)
|
||||
suffix = '.exe' if iswindows else ''
|
||||
pdfinfo = os.path.join(base, 'pdfinfo') + suffix
|
||||
@@ -32,12 +32,14 @@ def read_info(outputdir, get_cover):
|
||||
way to pass unicode paths via command line arguments. This also ensures
|
||||
that if poppler crashes, no stale file handles are left for the original
|
||||
file, only for src.pdf.'''
|
||||
os.chdir(outputdir)
|
||||
pdfinfo, pdftoppm = get_tools()
|
||||
source_file = os.path.join(outputdir, 'src.pdf')
|
||||
cover_file = os.path.join(outputdir, 'cover')
|
||||
ans = {}
|
||||
|
||||
try:
|
||||
raw = subprocess.check_output([pdfinfo, '-enc', 'UTF-8', '-isodates', 'src.pdf'])
|
||||
raw = subprocess.check_output([pdfinfo, '-enc', 'UTF-8', '-isodates',
|
||||
source_file])
|
||||
except subprocess.CalledProcessError as e:
|
||||
prints('pdfinfo errored out with return code: %d'%e.returncode)
|
||||
return None
|
||||
@@ -61,7 +63,7 @@ def read_info(outputdir, get_cover):
|
||||
# https://cgit.freedesktop.org/poppler/poppler/commit/?id=c91483aceb1b640771f572cb3df9ad707e5cad0d
|
||||
# we can no longer rely on it.
|
||||
try:
|
||||
raw = subprocess.check_output([pdfinfo, '-meta', 'src.pdf']).strip()
|
||||
raw = subprocess.check_output([pdfinfo, '-meta', source_file]).strip()
|
||||
except subprocess.CalledProcessError as e:
|
||||
prints('pdfinfo failed to read XML metadata with return code: %d'%e.returncode)
|
||||
else:
|
||||
@@ -74,8 +76,8 @@ def read_info(outputdir, get_cover):
|
||||
|
||||
if get_cover:
|
||||
try:
|
||||
subprocess.check_call([pdftoppm, '-singlefile', '-jpeg', '-cropbox',
|
||||
'src.pdf', 'cover'])
|
||||
subprocess.check_call([pdftoppm, '-singlefile', '-jpeg',
|
||||
'-cropbox', source_file, cover_file])
|
||||
except subprocess.CalledProcessError as e:
|
||||
prints('pdftoppm errored out with return code: %d'%e.returncode)
|
||||
|
||||
@@ -114,17 +116,7 @@ def get_metadata(stream, cover=True):
|
||||
stream.seek(0)
|
||||
with open(os.path.join(pdfpath, 'src.pdf'), 'wb') as f:
|
||||
shutil.copyfileobj(stream, f)
|
||||
try:
|
||||
res = fork_job('calibre.ebooks.metadata.pdf', 'read_info',
|
||||
(pdfpath, bool(cover)))
|
||||
except WorkerError as e:
|
||||
prints(e.orig_tb)
|
||||
raise RuntimeError('Failed to run pdfinfo')
|
||||
info = res['result']
|
||||
with open(res['stdout_stderr'], 'rb') as f:
|
||||
raw = f.read().strip()
|
||||
if raw:
|
||||
prints(raw)
|
||||
info = read_info(pdfpath, bool(cover))
|
||||
if info is None:
|
||||
raise ValueError('Could not read info dict from PDF')
|
||||
covpath = os.path.join(pdfpath, 'cover.jpg')
|
||||
@@ -140,8 +132,6 @@ def get_metadata(stream, cover=True):
|
||||
else:
|
||||
au = string_to_authors(au)
|
||||
mi = MetaInformation(title, au)
|
||||
# if isbn is not None:
|
||||
# mi.isbn = isbn
|
||||
|
||||
creator = info.get('Creator', None)
|
||||
if creator:
|
||||
@@ -161,7 +151,7 @@ def get_metadata(stream, cover=True):
|
||||
mi.tags.insert(0, subject)
|
||||
|
||||
if 'xmp_metadata' in info:
|
||||
from calibre.ebooks.metadata.xmp import consolidate_metadata
|
||||
from ebook_converter.ebooks.metadata.xmp import consolidate_metadata
|
||||
mi = consolidate_metadata(mi, info)
|
||||
|
||||
# Look for recognizable identifiers in the info dict, if they were not
|
||||
@@ -182,9 +172,8 @@ def get_metadata(stream, cover=True):
|
||||
|
||||
get_quick_metadata = partial(get_metadata, cover=False)
|
||||
|
||||
from calibre.utils.podofo import set_metadata as podofo_set_metadata
|
||||
#from ebook_converter.utils.podofo import set_metadata as podofo_set_metadata
|
||||
|
||||
|
||||
def set_metadata(stream, mi):
|
||||
stream.seek(0)
|
||||
return podofo_set_metadata(stream, mi)
|
||||
return None
|
||||
|
||||
@@ -12,14 +12,14 @@ from collections import defaultdict
|
||||
from lxml import etree
|
||||
from lxml.builder import ElementMaker
|
||||
|
||||
from calibre import prints
|
||||
from calibre.ebooks.metadata import check_isbn, check_doi
|
||||
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||
from calibre.ebooks.metadata.book.base import Metadata
|
||||
from calibre.ebooks.metadata.opf2 import dump_dict
|
||||
from calibre.utils.date import parse_date, isoformat, now
|
||||
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
|
||||
from polyglot.builtins import iteritems, string_or_bytes, filter
|
||||
from ebook_converter import prints
|
||||
from ebook_converter.ebooks.metadata import check_isbn, check_doi
|
||||
from ebook_converter.utils.xml_parse import safe_xml_fromstring
|
||||
from ebook_converter.ebooks.metadata.book.base import Metadata
|
||||
from ebook_converter.ebooks.metadata.opf2 import dump_dict
|
||||
from ebook_converter.utils.date import parse_date, isoformat, now
|
||||
from ebook_converter.utils.localization import canonicalize_lang, lang_as_iso639_1
|
||||
from ebook_converter.polyglot.builtins import iteritems, string_or_bytes, filter
|
||||
|
||||
_xml_declaration = re.compile(r'<\?xml[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE)
|
||||
|
||||
@@ -173,8 +173,8 @@ def read_series(root):
|
||||
|
||||
|
||||
def read_user_metadata(mi, root):
|
||||
from calibre.utils.config import from_json
|
||||
from calibre.ebooks.metadata.book.json_codec import decode_is_multiple
|
||||
from ebook_converter.utils.config import from_json
|
||||
from ebook_converter.ebooks.metadata.book.json_codec import decode_is_multiple
|
||||
fields = set()
|
||||
for item in XPath('//calibre:custom_metadata')(root):
|
||||
for li in XPath('./rdf:Bag/rdf:li')(item):
|
||||
@@ -436,8 +436,8 @@ def create_series(calibre, series, series_index):
|
||||
|
||||
|
||||
def create_user_metadata(calibre, all_user_metadata):
|
||||
from calibre.utils.config import to_json
|
||||
from calibre.ebooks.metadata.book.json_codec import object_to_unicode, encode_is_multiple
|
||||
from ebook_converter.utils.config import to_json
|
||||
from ebook_converter.ebooks.metadata.book.json_codec import object_to_unicode, encode_is_multiple
|
||||
|
||||
s = calibre.makeelement(expand('calibre:custom_metadata'))
|
||||
calibre.append(s)
|
||||
@@ -640,7 +640,7 @@ def merge_xmp_packet(old, new):
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
from calibre.utils.podofo import get_xmp_metadata
|
||||
from ebook_converter.utils.podofo import get_xmp_metadata
|
||||
xmp_packet = get_xmp_metadata(sys.argv[-1])
|
||||
mi = metadata_from_xmp_packet(xmp_packet)
|
||||
np = metadata_to_xmp_packet(mi)
|
||||
|
||||
@@ -14,7 +14,7 @@ from ebook_converter.utils.img import save_cover_data_to, scale_image, image_to_
|
||||
from ebook_converter.utils.imghdr import what
|
||||
from ebook_converter.ebooks import normalize
|
||||
from ebook_converter.polyglot.builtins import unicode_type, range, as_bytes, map
|
||||
from tinycss.color3 import parse_color_string
|
||||
from ebook_converter.tinycss.color3 import parse_color_string
|
||||
|
||||
IMAGE_MAX_SIZE = 10 * 1024 * 1024
|
||||
RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed))
|
||||
|
||||
@@ -500,7 +500,6 @@ class Style(object):
|
||||
background shortcut properties. Note that inheritance/default values
|
||||
are not used. None is returned if no background color is set.
|
||||
'''
|
||||
|
||||
def validate_color(col):
|
||||
return cssprofiles.validateWithProfile('color',
|
||||
col,
|
||||
|
||||
@@ -98,7 +98,7 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
|
||||
with lopen(index, 'r+b') as i:
|
||||
raw = i.read().decode('utf-8', 'replace')
|
||||
raw = flip_images(raw)
|
||||
raw = raw.replace('<head', '<!-- created by ebook_converter\'s pdftohtml -->\n <head', 1)
|
||||
raw = raw.replace('<head', '<!-- created by ebook-converter\'s pdftohtml -->\n <head', 1)
|
||||
i.seek(0)
|
||||
i.truncate()
|
||||
# versions of pdftohtml >= 0.20 output self closing <br> tags, this
|
||||
|
||||
@@ -12,9 +12,9 @@
|
||||
from .version import VERSION
|
||||
__version__ = VERSION
|
||||
|
||||
from tinycss.css21 import CSS21Parser
|
||||
from tinycss.page3 import CSSPage3Parser
|
||||
from tinycss.fonts3 import CSSFonts3Parser
|
||||
from ebook_converter.tinycss.css21 import CSS21Parser
|
||||
from ebook_converter.tinycss.page3 import CSSPage3Parser
|
||||
from ebook_converter.tinycss.fonts3 import CSSFonts3Parser
|
||||
from ebook_converter.tinycss.media3 import CSSMedia3Parser
|
||||
|
||||
|
||||
|
||||
@@ -13,10 +13,10 @@
|
||||
from __future__ import unicode_literals
|
||||
from itertools import chain, islice
|
||||
|
||||
from tinycss.decoding import decode
|
||||
from tinycss.token_data import TokenList
|
||||
from tinycss.tokenizer import tokenize_grouped
|
||||
from tinycss.parsing import (
|
||||
from ebook_converter.tinycss.decoding import decode
|
||||
from ebook_converter.tinycss.token_data import TokenList
|
||||
from ebook_converter.tinycss.tokenizer import tokenize_grouped
|
||||
from ebook_converter.tinycss.parsing import (
|
||||
strip_whitespace, remove_whitespace, split_on_comma, validate_value,
|
||||
validate_any, ParseError)
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@ from __future__ import unicode_literals
|
||||
import operator
|
||||
import re
|
||||
|
||||
from polyglot.binary import from_hex_bytes
|
||||
from ebook_converter.polyglot.binary import from_hex_bytes
|
||||
|
||||
|
||||
__all__ = ['decode'] # Everything else is implementation detail
|
||||
|
||||
@@ -8,8 +8,8 @@ __copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import re
|
||||
from ebook_converter.polyglot.builtins import map
|
||||
from tinycss.css21 import CSS21Parser, ParseError
|
||||
from tinycss.tokenizer import tokenize_grouped
|
||||
from ebook_converter.tinycss.css21 import CSS21Parser, ParseError
|
||||
from ebook_converter.tinycss.tokenizer import tokenize_grouped
|
||||
|
||||
|
||||
def parse_font_family_tokens(tokens):
|
||||
|
||||
@@ -5,8 +5,8 @@ from __future__ import absolute_import, division, print_function, unicode_litera
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
from tinycss.css21 import CSS21Parser
|
||||
from tinycss.parsing import remove_whitespace, split_on_comma, ParseError
|
||||
from ebook_converter.tinycss.css21 import CSS21Parser
|
||||
from ebook_converter.tinycss.parsing import remove_whitespace, split_on_comma, ParseError
|
||||
from ebook_converter.polyglot.builtins import error_message
|
||||
|
||||
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from tinycss import token_data
|
||||
from ebook_converter.tinycss import token_data
|
||||
|
||||
|
||||
def tokenize_flat(css_source, ignore_comments=True,
|
||||
|
||||
Reference in New Issue
Block a user