1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-02-20 16:55:50 +01:00

Added htmlz and pdf formats.

Added HTML reader/writer and PDF reader.
This commit is contained in:
2020-04-19 13:43:16 +02:00
parent ebeca30bda
commit d2159ed60c
17 changed files with 55 additions and 65 deletions

View File

@@ -53,6 +53,8 @@ Currently, I've tested following input formats:
- rtf - rtf
- mobi - mobi
- fb2 - fb2
- html
- pdf
Note that the old Microsoft doc format is not supported, although old documents Note that the old Microsoft doc format is not supported, although old documents
can be fairly easily converted using text processor programs, like Word or can be fairly easily converted using text processor programs, like Word or
@@ -68,6 +70,7 @@ Currently, following formats are supported:
- epub - epub
- mobi - mobi
- docx - docx
- htmlz (zipped HTML file with additional assets, like images)
Installation Installation

View File

@@ -41,8 +41,7 @@ class PDFInput(InputFormatPlugin):
PDFDocument(xml, self.opts, self.log) PDFDocument(xml, self.opts, self.log)
return os.path.join(getcwd(), 'metadata.opf') return os.path.join(getcwd(), 'metadata.opf')
def convert(self, stream, options, file_ext, log, def convert(self, stream, options, file_ext, log, accelerators):
accelerators):
from ebook_converter.ebooks.metadata.opf2 import OPFCreator from ebook_converter.ebooks.metadata.opf2 import OPFCreator
from ebook_converter.ebooks.pdf.pdftohtml import pdftohtml from ebook_converter.ebooks.pdf.pdftohtml import pdftohtml

View File

@@ -471,7 +471,7 @@ class HTMLPreProcessor(object):
return re.search('<H2[^><]*id=BookTitle', raw) is not None return re.search('<H2[^><]*id=BookTitle', raw) is not None
def is_pdftohtml(self, src): def is_pdftohtml(self, src):
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000] return '<!-- created by ebook-converter\'s pdftohtml -->' in src[:1000]
def __call__(self, html, remove_special_chars=None, def __call__(self, html, remove_special_chars=None,
get_preprocess_html=False): get_preprocess_html=False):
@@ -627,7 +627,7 @@ class HTMLPreProcessor(object):
html = preprocessor(html) html = preprocessor(html)
if is_pdftohtml: if is_pdftohtml:
html = html.replace('<!-- created by calibre\'s pdftohtml -->', '') html = html.replace('<!-- created by ebook-converter\'s pdftohtml -->', '')
if getattr(self.extra_opts, 'smarten_punctuation', False): if getattr(self.extra_opts, 'smarten_punctuation', False):
html = smarten_punctuation(html, self.log) html = smarten_punctuation(html, self.log)

View File

@@ -43,7 +43,7 @@ class HeuristicProcessor(object):
self.common_in_text_beginnings = '[\\w\'\"“‘‛]' self.common_in_text_beginnings = '[\\w\'\"“‘‛]'
def is_pdftohtml(self, src): def is_pdftohtml(self, src):
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000] return '<!-- created by ebook-converter\'s pdftohtml -->' in src[:1000]
def is_abbyy(self, src): def is_abbyy(self, src):
return '<meta name="generator" content="ABBYY FineReader' in src[:1000] return '<meta name="generator" content="ABBYY FineReader' in src[:1000]

View File

@@ -15,7 +15,7 @@ from ebook_converter.ebooks import parse_css_length
from ebook_converter.ebooks.docx.writer.utils import convert_color, int_or_zero from ebook_converter.ebooks.docx.writer.utils import convert_color, int_or_zero
from ebook_converter.utils.localization import lang_as_iso639_1 from ebook_converter.utils.localization import lang_as_iso639_1
from ebook_converter.polyglot.builtins import iteritems, filter, unicode_type from ebook_converter.polyglot.builtins import iteritems, filter, unicode_type
from tinycss.css21 import CSS21Parser from ebook_converter.tinycss.css21 import CSS21Parser
css_parser = CSS21Parser() css_parser = CSS21Parser()

View File

@@ -5,7 +5,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from tinycss.color3 import parse_color_string from ebook_converter.tinycss.color3 import parse_color_string
def int_or_zero(raw): def int_or_zero(raw):

View File

@@ -7,17 +7,17 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import os, subprocess, shutil, re import os, subprocess, shutil, re
from functools import partial from functools import partial
from calibre import prints from ebook_converter import prints
from calibre.constants import iswindows, ispy3 from ebook_converter.constants import iswindows, ispy3
from calibre.ptempfile import TemporaryDirectory from ebook_converter.ptempfile import TemporaryDirectory
from calibre.ebooks.metadata import ( from ebook_converter.ebooks.metadata import (
MetaInformation, string_to_authors, check_isbn, check_doi) MetaInformation, string_to_authors, check_isbn, check_doi)
from calibre.utils.ipc.simple_worker import fork_job, WorkerError from ebook_converter.utils.ipc.simple_worker import fork_job, WorkerError
from polyglot.builtins import iteritems, unicode_type from ebook_converter.polyglot.builtins import iteritems, unicode_type
def get_tools(): def get_tools():
from calibre.ebooks.pdf.pdftohtml import PDFTOHTML from ebook_converter.ebooks.pdf.pdftohtml import PDFTOHTML
base = os.path.dirname(PDFTOHTML) base = os.path.dirname(PDFTOHTML)
suffix = '.exe' if iswindows else '' suffix = '.exe' if iswindows else ''
pdfinfo = os.path.join(base, 'pdfinfo') + suffix pdfinfo = os.path.join(base, 'pdfinfo') + suffix
@@ -32,12 +32,14 @@ def read_info(outputdir, get_cover):
way to pass unicode paths via command line arguments. This also ensures way to pass unicode paths via command line arguments. This also ensures
that if poppler crashes, no stale file handles are left for the original that if poppler crashes, no stale file handles are left for the original
file, only for src.pdf.''' file, only for src.pdf.'''
os.chdir(outputdir)
pdfinfo, pdftoppm = get_tools() pdfinfo, pdftoppm = get_tools()
source_file = os.path.join(outputdir, 'src.pdf')
cover_file = os.path.join(outputdir, 'cover')
ans = {} ans = {}
try: try:
raw = subprocess.check_output([pdfinfo, '-enc', 'UTF-8', '-isodates', 'src.pdf']) raw = subprocess.check_output([pdfinfo, '-enc', 'UTF-8', '-isodates',
source_file])
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
prints('pdfinfo errored out with return code: %d'%e.returncode) prints('pdfinfo errored out with return code: %d'%e.returncode)
return None return None
@@ -61,7 +63,7 @@ def read_info(outputdir, get_cover):
# https://cgit.freedesktop.org/poppler/poppler/commit/?id=c91483aceb1b640771f572cb3df9ad707e5cad0d # https://cgit.freedesktop.org/poppler/poppler/commit/?id=c91483aceb1b640771f572cb3df9ad707e5cad0d
# we can no longer rely on it. # we can no longer rely on it.
try: try:
raw = subprocess.check_output([pdfinfo, '-meta', 'src.pdf']).strip() raw = subprocess.check_output([pdfinfo, '-meta', source_file]).strip()
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
prints('pdfinfo failed to read XML metadata with return code: %d'%e.returncode) prints('pdfinfo failed to read XML metadata with return code: %d'%e.returncode)
else: else:
@@ -74,8 +76,8 @@ def read_info(outputdir, get_cover):
if get_cover: if get_cover:
try: try:
subprocess.check_call([pdftoppm, '-singlefile', '-jpeg', '-cropbox', subprocess.check_call([pdftoppm, '-singlefile', '-jpeg',
'src.pdf', 'cover']) '-cropbox', source_file, cover_file])
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
prints('pdftoppm errored out with return code: %d'%e.returncode) prints('pdftoppm errored out with return code: %d'%e.returncode)
@@ -114,17 +116,7 @@ def get_metadata(stream, cover=True):
stream.seek(0) stream.seek(0)
with open(os.path.join(pdfpath, 'src.pdf'), 'wb') as f: with open(os.path.join(pdfpath, 'src.pdf'), 'wb') as f:
shutil.copyfileobj(stream, f) shutil.copyfileobj(stream, f)
try: info = read_info(pdfpath, bool(cover))
res = fork_job('calibre.ebooks.metadata.pdf', 'read_info',
(pdfpath, bool(cover)))
except WorkerError as e:
prints(e.orig_tb)
raise RuntimeError('Failed to run pdfinfo')
info = res['result']
with open(res['stdout_stderr'], 'rb') as f:
raw = f.read().strip()
if raw:
prints(raw)
if info is None: if info is None:
raise ValueError('Could not read info dict from PDF') raise ValueError('Could not read info dict from PDF')
covpath = os.path.join(pdfpath, 'cover.jpg') covpath = os.path.join(pdfpath, 'cover.jpg')
@@ -140,8 +132,6 @@ def get_metadata(stream, cover=True):
else: else:
au = string_to_authors(au) au = string_to_authors(au)
mi = MetaInformation(title, au) mi = MetaInformation(title, au)
# if isbn is not None:
# mi.isbn = isbn
creator = info.get('Creator', None) creator = info.get('Creator', None)
if creator: if creator:
@@ -161,7 +151,7 @@ def get_metadata(stream, cover=True):
mi.tags.insert(0, subject) mi.tags.insert(0, subject)
if 'xmp_metadata' in info: if 'xmp_metadata' in info:
from calibre.ebooks.metadata.xmp import consolidate_metadata from ebook_converter.ebooks.metadata.xmp import consolidate_metadata
mi = consolidate_metadata(mi, info) mi = consolidate_metadata(mi, info)
# Look for recognizable identifiers in the info dict, if they were not # Look for recognizable identifiers in the info dict, if they were not
@@ -182,9 +172,8 @@ def get_metadata(stream, cover=True):
get_quick_metadata = partial(get_metadata, cover=False) get_quick_metadata = partial(get_metadata, cover=False)
from calibre.utils.podofo import set_metadata as podofo_set_metadata #from ebook_converter.utils.podofo import set_metadata as podofo_set_metadata
def set_metadata(stream, mi): def set_metadata(stream, mi):
stream.seek(0) return None
return podofo_set_metadata(stream, mi)

View File

@@ -12,14 +12,14 @@ from collections import defaultdict
from lxml import etree from lxml import etree
from lxml.builder import ElementMaker from lxml.builder import ElementMaker
from calibre import prints from ebook_converter import prints
from calibre.ebooks.metadata import check_isbn, check_doi from ebook_converter.ebooks.metadata import check_isbn, check_doi
from calibre.utils.xml_parse import safe_xml_fromstring from ebook_converter.utils.xml_parse import safe_xml_fromstring
from calibre.ebooks.metadata.book.base import Metadata from ebook_converter.ebooks.metadata.book.base import Metadata
from calibre.ebooks.metadata.opf2 import dump_dict from ebook_converter.ebooks.metadata.opf2 import dump_dict
from calibre.utils.date import parse_date, isoformat, now from ebook_converter.utils.date import parse_date, isoformat, now
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1 from ebook_converter.utils.localization import canonicalize_lang, lang_as_iso639_1
from polyglot.builtins import iteritems, string_or_bytes, filter from ebook_converter.polyglot.builtins import iteritems, string_or_bytes, filter
_xml_declaration = re.compile(r'<\?xml[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE) _xml_declaration = re.compile(r'<\?xml[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE)
@@ -173,8 +173,8 @@ def read_series(root):
def read_user_metadata(mi, root): def read_user_metadata(mi, root):
from calibre.utils.config import from_json from ebook_converter.utils.config import from_json
from calibre.ebooks.metadata.book.json_codec import decode_is_multiple from ebook_converter.ebooks.metadata.book.json_codec import decode_is_multiple
fields = set() fields = set()
for item in XPath('//calibre:custom_metadata')(root): for item in XPath('//calibre:custom_metadata')(root):
for li in XPath('./rdf:Bag/rdf:li')(item): for li in XPath('./rdf:Bag/rdf:li')(item):
@@ -436,8 +436,8 @@ def create_series(calibre, series, series_index):
def create_user_metadata(calibre, all_user_metadata): def create_user_metadata(calibre, all_user_metadata):
from calibre.utils.config import to_json from ebook_converter.utils.config import to_json
from calibre.ebooks.metadata.book.json_codec import object_to_unicode, encode_is_multiple from ebook_converter.ebooks.metadata.book.json_codec import object_to_unicode, encode_is_multiple
s = calibre.makeelement(expand('calibre:custom_metadata')) s = calibre.makeelement(expand('calibre:custom_metadata'))
calibre.append(s) calibre.append(s)
@@ -640,7 +640,7 @@ def merge_xmp_packet(old, new):
if __name__ == '__main__': if __name__ == '__main__':
from calibre.utils.podofo import get_xmp_metadata from ebook_converter.utils.podofo import get_xmp_metadata
xmp_packet = get_xmp_metadata(sys.argv[-1]) xmp_packet = get_xmp_metadata(sys.argv[-1])
mi = metadata_from_xmp_packet(xmp_packet) mi = metadata_from_xmp_packet(xmp_packet)
np = metadata_to_xmp_packet(mi) np = metadata_to_xmp_packet(mi)

View File

@@ -14,7 +14,7 @@ from ebook_converter.utils.img import save_cover_data_to, scale_image, image_to_
from ebook_converter.utils.imghdr import what from ebook_converter.utils.imghdr import what
from ebook_converter.ebooks import normalize from ebook_converter.ebooks import normalize
from ebook_converter.polyglot.builtins import unicode_type, range, as_bytes, map from ebook_converter.polyglot.builtins import unicode_type, range, as_bytes, map
from tinycss.color3 import parse_color_string from ebook_converter.tinycss.color3 import parse_color_string
IMAGE_MAX_SIZE = 10 * 1024 * 1024 IMAGE_MAX_SIZE = 10 * 1024 * 1024
RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed)) RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed))

View File

@@ -500,7 +500,6 @@ class Style(object):
background shortcut properties. Note that inheritance/default values background shortcut properties. Note that inheritance/default values
are not used. None is returned if no background color is set. are not used. None is returned if no background color is set.
''' '''
def validate_color(col): def validate_color(col):
return cssprofiles.validateWithProfile('color', return cssprofiles.validateWithProfile('color',
col, col,

View File

@@ -98,7 +98,7 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
with lopen(index, 'r+b') as i: with lopen(index, 'r+b') as i:
raw = i.read().decode('utf-8', 'replace') raw = i.read().decode('utf-8', 'replace')
raw = flip_images(raw) raw = flip_images(raw)
raw = raw.replace('<head', '<!-- created by ebook_converter\'s pdftohtml -->\n <head', 1) raw = raw.replace('<head', '<!-- created by ebook-converter\'s pdftohtml -->\n <head', 1)
i.seek(0) i.seek(0)
i.truncate() i.truncate()
# versions of pdftohtml >= 0.20 output self closing <br> tags, this # versions of pdftohtml >= 0.20 output self closing <br> tags, this

View File

@@ -12,9 +12,9 @@
from .version import VERSION from .version import VERSION
__version__ = VERSION __version__ = VERSION
from tinycss.css21 import CSS21Parser from ebook_converter.tinycss.css21 import CSS21Parser
from tinycss.page3 import CSSPage3Parser from ebook_converter.tinycss.page3 import CSSPage3Parser
from tinycss.fonts3 import CSSFonts3Parser from ebook_converter.tinycss.fonts3 import CSSFonts3Parser
from ebook_converter.tinycss.media3 import CSSMedia3Parser from ebook_converter.tinycss.media3 import CSSMedia3Parser

View File

@@ -13,10 +13,10 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from itertools import chain, islice from itertools import chain, islice
from tinycss.decoding import decode from ebook_converter.tinycss.decoding import decode
from tinycss.token_data import TokenList from ebook_converter.tinycss.token_data import TokenList
from tinycss.tokenizer import tokenize_grouped from ebook_converter.tinycss.tokenizer import tokenize_grouped
from tinycss.parsing import ( from ebook_converter.tinycss.parsing import (
strip_whitespace, remove_whitespace, split_on_comma, validate_value, strip_whitespace, remove_whitespace, split_on_comma, validate_value,
validate_any, ParseError) validate_any, ParseError)

View File

@@ -15,7 +15,7 @@ from __future__ import unicode_literals
import operator import operator
import re import re
from polyglot.binary import from_hex_bytes from ebook_converter.polyglot.binary import from_hex_bytes
__all__ = ['decode'] # Everything else is implementation detail __all__ = ['decode'] # Everything else is implementation detail

View File

@@ -8,8 +8,8 @@ __copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
import re import re
from ebook_converter.polyglot.builtins import map from ebook_converter.polyglot.builtins import map
from tinycss.css21 import CSS21Parser, ParseError from ebook_converter.tinycss.css21 import CSS21Parser, ParseError
from tinycss.tokenizer import tokenize_grouped from ebook_converter.tinycss.tokenizer import tokenize_grouped
def parse_font_family_tokens(tokens): def parse_font_family_tokens(tokens):

View File

@@ -5,8 +5,8 @@ from __future__ import absolute_import, division, print_function, unicode_litera
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>' __copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
from tinycss.css21 import CSS21Parser from ebook_converter.tinycss.css21 import CSS21Parser
from tinycss.parsing import remove_whitespace, split_on_comma, ParseError from ebook_converter.tinycss.parsing import remove_whitespace, split_on_comma, ParseError
from ebook_converter.polyglot.builtins import error_message from ebook_converter.polyglot.builtins import error_message

View File

@@ -14,7 +14,7 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from tinycss import token_data from ebook_converter.tinycss import token_data
def tokenize_flat(css_source, ignore_comments=True, def tokenize_flat(css_source, ignore_comments=True,