From d2159ed60c8dce8089baaa5f64d4a43a7ad7f0ff Mon Sep 17 00:00:00 2001 From: gryf Date: Sun, 19 Apr 2020 13:43:16 +0200 Subject: [PATCH] Added htmlz and pdf formats. Added HTML reader/writer and PDF reader. --- README.rst | 3 ++ .../ebooks/conversion/plugins/pdf_input.py | 3 +- .../ebooks/conversion/preprocess.py | 4 +- ebook_converter/ebooks/conversion/utils.py | 2 +- ebook_converter/ebooks/docx/writer/styles.py | 2 +- ebook_converter/ebooks/docx/writer/utils.py | 2 +- ebook_converter/ebooks/metadata/pdf.py | 47 +++++++------------ ebook_converter/ebooks/metadata/xmp.py | 26 +++++----- ebook_converter/ebooks/mobi/utils.py | 2 +- ebook_converter/ebooks/oeb/stylizer.py | 1 - ebook_converter/ebooks/pdf/pdftohtml.py | 2 +- ebook_converter/tinycss/__init__.py | 6 +-- ebook_converter/tinycss/css21.py | 8 ++-- ebook_converter/tinycss/decoding.py | 2 +- ebook_converter/tinycss/fonts3.py | 4 +- ebook_converter/tinycss/media3.py | 4 +- ebook_converter/tinycss/tokenizer.py | 2 +- 17 files changed, 55 insertions(+), 65 deletions(-) diff --git a/README.rst b/README.rst index c204d6a..e657b11 100644 --- a/README.rst +++ b/README.rst @@ -53,6 +53,8 @@ Currently, I've tested following input formats: - rtf - mobi - fb2 +- html +- pdf Note, that old Microsoft doc format is not supported, although old documents can be fairly easy converted using text processors programs, lik Word or @@ -68,6 +70,7 @@ Currently, following formats are supported: - epub - mobi - docx +- htmlz (zipped HTML file with additional assets, like images) Installation diff --git a/ebook_converter/ebooks/conversion/plugins/pdf_input.py b/ebook_converter/ebooks/conversion/plugins/pdf_input.py index 3dd2f03..558abde 100644 --- a/ebook_converter/ebooks/conversion/plugins/pdf_input.py +++ b/ebook_converter/ebooks/conversion/plugins/pdf_input.py @@ -41,8 +41,7 @@ class PDFInput(InputFormatPlugin): PDFDocument(xml, self.opts, self.log) return os.path.join(getcwd(), 'metadata.opf') - def convert(self, stream, options, file_ext, log, - accelerators): + def convert(self, stream, options, file_ext, log, accelerators): from ebook_converter.ebooks.metadata.opf2 import OPFCreator from ebook_converter.ebooks.pdf.pdftohtml import pdftohtml diff --git a/ebook_converter/ebooks/conversion/preprocess.py b/ebook_converter/ebooks/conversion/preprocess.py index cc821c7..eab5560 100644 --- a/ebook_converter/ebooks/conversion/preprocess.py +++ b/ebook_converter/ebooks/conversion/preprocess.py @@ -471,7 +471,7 @@ class HTMLPreProcessor(object): return re.search('<]*id=BookTitle', raw) is not None def is_pdftohtml(self, src): - return '' in src[:1000] + return '' in src[:1000] def __call__(self, html, remove_special_chars=None, get_preprocess_html=False): @@ -627,7 +627,7 @@ class HTMLPreProcessor(object): html = preprocessor(html) if is_pdftohtml: - html = html.replace('', '') + html = html.replace('', '') if getattr(self.extra_opts, 'smarten_punctuation', False): html = smarten_punctuation(html, self.log) diff --git a/ebook_converter/ebooks/conversion/utils.py b/ebook_converter/ebooks/conversion/utils.py index a53dad5..79a725d 100644 --- a/ebook_converter/ebooks/conversion/utils.py +++ b/ebook_converter/ebooks/conversion/utils.py @@ -43,7 +43,7 @@ class HeuristicProcessor(object): self.common_in_text_beginnings = '[\\w\'\"“‘‛]' def is_pdftohtml(self, src): - return '' in src[:1000] + return '' in src[:1000] def is_abbyy(self, src): return ']*>', re.IGNORECASE) @@ -173,8 +173,8 @@ def read_series(root): def read_user_metadata(mi, root): - from calibre.utils.config import from_json - from calibre.ebooks.metadata.book.json_codec import decode_is_multiple + from ebook_converter.utils.config import from_json + from ebook_converter.ebooks.metadata.book.json_codec import decode_is_multiple fields = set() for item in XPath('//calibre:custom_metadata')(root): for li in XPath('./rdf:Bag/rdf:li')(item): @@ -436,8 +436,8 @@ def create_series(calibre, series, series_index): def create_user_metadata(calibre, all_user_metadata): - from calibre.utils.config import to_json - from calibre.ebooks.metadata.book.json_codec import object_to_unicode, encode_is_multiple + from ebook_converter.utils.config import to_json + from ebook_converter.ebooks.metadata.book.json_codec import object_to_unicode, encode_is_multiple s = calibre.makeelement(expand('calibre:custom_metadata')) calibre.append(s) @@ -640,7 +640,7 @@ def merge_xmp_packet(old, new): if __name__ == '__main__': - from calibre.utils.podofo import get_xmp_metadata + from ebook_converter.utils.podofo import get_xmp_metadata xmp_packet = get_xmp_metadata(sys.argv[-1]) mi = metadata_from_xmp_packet(xmp_packet) np = metadata_to_xmp_packet(mi) diff --git a/ebook_converter/ebooks/mobi/utils.py b/ebook_converter/ebooks/mobi/utils.py index 7dd2943..4b08a91 100644 --- a/ebook_converter/ebooks/mobi/utils.py +++ b/ebook_converter/ebooks/mobi/utils.py @@ -14,7 +14,7 @@ from ebook_converter.utils.img import save_cover_data_to, scale_image, image_to_ from ebook_converter.utils.imghdr import what from ebook_converter.ebooks import normalize from ebook_converter.polyglot.builtins import unicode_type, range, as_bytes, map -from tinycss.color3 import parse_color_string +from ebook_converter.tinycss.color3 import parse_color_string IMAGE_MAX_SIZE = 10 * 1024 * 1024 RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed)) diff --git a/ebook_converter/ebooks/oeb/stylizer.py b/ebook_converter/ebooks/oeb/stylizer.py index 0a2a553..16cba7c 100644 --- a/ebook_converter/ebooks/oeb/stylizer.py +++ b/ebook_converter/ebooks/oeb/stylizer.py @@ -500,7 +500,6 @@ class Style(object): background shortcut properties. Note that inheritance/default values are not used. None is returned if no background color is set. ''' - def validate_color(col): return cssprofiles.validateWithProfile('color', col, diff --git a/ebook_converter/ebooks/pdf/pdftohtml.py b/ebook_converter/ebooks/pdf/pdftohtml.py index 248e37c..40c95b3 100644 --- a/ebook_converter/ebooks/pdf/pdftohtml.py +++ b/ebook_converter/ebooks/pdf/pdftohtml.py @@ -98,7 +98,7 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False): with lopen(index, 'r+b') as i: raw = i.read().decode('utf-8', 'replace') raw = flip_images(raw) - raw = raw.replace('\n \n = 0.20 output self closing
tags, this diff --git a/ebook_converter/tinycss/__init__.py b/ebook_converter/tinycss/__init__.py index 0ec8afe..b994f53 100644 --- a/ebook_converter/tinycss/__init__.py +++ b/ebook_converter/tinycss/__init__.py @@ -12,9 +12,9 @@ from .version import VERSION __version__ = VERSION -from tinycss.css21 import CSS21Parser -from tinycss.page3 import CSSPage3Parser -from tinycss.fonts3 import CSSFonts3Parser +from ebook_converter.tinycss.css21 import CSS21Parser +from ebook_converter.tinycss.page3 import CSSPage3Parser +from ebook_converter.tinycss.fonts3 import CSSFonts3Parser from ebook_converter.tinycss.media3 import CSSMedia3Parser diff --git a/ebook_converter/tinycss/css21.py b/ebook_converter/tinycss/css21.py index 2f1d3f8..7581e8f 100644 --- a/ebook_converter/tinycss/css21.py +++ b/ebook_converter/tinycss/css21.py @@ -13,10 +13,10 @@ from __future__ import unicode_literals from itertools import chain, islice -from tinycss.decoding import decode -from tinycss.token_data import TokenList -from tinycss.tokenizer import tokenize_grouped -from tinycss.parsing import ( +from ebook_converter.tinycss.decoding import decode +from ebook_converter.tinycss.token_data import TokenList +from ebook_converter.tinycss.tokenizer import tokenize_grouped +from ebook_converter.tinycss.parsing import ( strip_whitespace, remove_whitespace, split_on_comma, validate_value, validate_any, ParseError) diff --git a/ebook_converter/tinycss/decoding.py b/ebook_converter/tinycss/decoding.py index 32e1799..f8e86fe 100644 --- a/ebook_converter/tinycss/decoding.py +++ b/ebook_converter/tinycss/decoding.py @@ -15,7 +15,7 @@ from __future__ import unicode_literals import operator import re -from polyglot.binary import from_hex_bytes +from ebook_converter.polyglot.binary import from_hex_bytes __all__ = ['decode'] # Everything else is implementation detail diff --git a/ebook_converter/tinycss/fonts3.py b/ebook_converter/tinycss/fonts3.py index d55e8dd..ee959f7 100644 --- a/ebook_converter/tinycss/fonts3.py +++ b/ebook_converter/tinycss/fonts3.py @@ -8,8 +8,8 @@ __copyright__ = '2014, Kovid Goyal ' import re from ebook_converter.polyglot.builtins import map -from tinycss.css21 import CSS21Parser, ParseError -from tinycss.tokenizer import tokenize_grouped +from ebook_converter.tinycss.css21 import CSS21Parser, ParseError +from ebook_converter.tinycss.tokenizer import tokenize_grouped def parse_font_family_tokens(tokens): diff --git a/ebook_converter/tinycss/media3.py b/ebook_converter/tinycss/media3.py index 9498c78..dff6499 100644 --- a/ebook_converter/tinycss/media3.py +++ b/ebook_converter/tinycss/media3.py @@ -5,8 +5,8 @@ from __future__ import absolute_import, division, print_function, unicode_litera __license__ = 'GPL v3' __copyright__ = '2014, Kovid Goyal ' -from tinycss.css21 import CSS21Parser -from tinycss.parsing import remove_whitespace, split_on_comma, ParseError +from ebook_converter.tinycss.css21 import CSS21Parser +from ebook_converter.tinycss.parsing import remove_whitespace, split_on_comma, ParseError from ebook_converter.polyglot.builtins import error_message diff --git a/ebook_converter/tinycss/tokenizer.py b/ebook_converter/tinycss/tokenizer.py index 154c111..6e03c8b 100644 --- a/ebook_converter/tinycss/tokenizer.py +++ b/ebook_converter/tinycss/tokenizer.py @@ -14,7 +14,7 @@ from __future__ import unicode_literals -from tinycss import token_data +from ebook_converter.tinycss import token_data def tokenize_flat(css_source, ignore_comments=True,