1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-01-01 15:32:26 +01:00

Added htmlz and pdf formats.

Added HTML reader/writer and PDF reader.
This commit is contained in:
2020-04-19 13:43:16 +02:00
parent ebeca30bda
commit d2159ed60c
17 changed files with 55 additions and 65 deletions

View File

@@ -53,6 +53,8 @@ Currently, I've tested following input formats:
- rtf
- mobi
- fb2
- html
- pdf
Note that the old Microsoft doc format is not supported, although old documents
can be fairly easily converted using text processing programs, like Word or
@@ -68,6 +70,7 @@ Currently, following formats are supported:
- epub
- mobi
- docx
- htmlz (zipped HTML file with additional assets, like images)
Installation

View File

@@ -41,8 +41,7 @@ class PDFInput(InputFormatPlugin):
PDFDocument(xml, self.opts, self.log)
return os.path.join(getcwd(), 'metadata.opf')
def convert(self, stream, options, file_ext, log,
accelerators):
def convert(self, stream, options, file_ext, log, accelerators):
from ebook_converter.ebooks.metadata.opf2 import OPFCreator
from ebook_converter.ebooks.pdf.pdftohtml import pdftohtml

View File

@@ -471,7 +471,7 @@ class HTMLPreProcessor(object):
return re.search('<H2[^><]*id=BookTitle', raw) is not None
def is_pdftohtml(self, src):
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
return '<!-- created by ebook-converter\'s pdftohtml -->' in src[:1000]
def __call__(self, html, remove_special_chars=None,
get_preprocess_html=False):
@@ -627,7 +627,7 @@ class HTMLPreProcessor(object):
html = preprocessor(html)
if is_pdftohtml:
html = html.replace('<!-- created by calibre\'s pdftohtml -->', '')
html = html.replace('<!-- created by ebook-converter\'s pdftohtml -->', '')
if getattr(self.extra_opts, 'smarten_punctuation', False):
html = smarten_punctuation(html, self.log)

View File

@@ -43,7 +43,7 @@ class HeuristicProcessor(object):
self.common_in_text_beginnings = '[\\w\'\"“‘‛]'
def is_pdftohtml(self, src):
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
return '<!-- created by ebook-converter\'s pdftohtml -->' in src[:1000]
def is_abbyy(self, src):
return '<meta name="generator" content="ABBYY FineReader' in src[:1000]

View File

@@ -15,7 +15,7 @@ from ebook_converter.ebooks import parse_css_length
from ebook_converter.ebooks.docx.writer.utils import convert_color, int_or_zero
from ebook_converter.utils.localization import lang_as_iso639_1
from ebook_converter.polyglot.builtins import iteritems, filter, unicode_type
from tinycss.css21 import CSS21Parser
from ebook_converter.tinycss.css21 import CSS21Parser
css_parser = CSS21Parser()

View File

@@ -5,7 +5,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from tinycss.color3 import parse_color_string
from ebook_converter.tinycss.color3 import parse_color_string
def int_or_zero(raw):

View File

@@ -7,17 +7,17 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import os, subprocess, shutil, re
from functools import partial
from calibre import prints
from calibre.constants import iswindows, ispy3
from calibre.ptempfile import TemporaryDirectory
from calibre.ebooks.metadata import (
from ebook_converter import prints
from ebook_converter.constants import iswindows, ispy3
from ebook_converter.ptempfile import TemporaryDirectory
from ebook_converter.ebooks.metadata import (
MetaInformation, string_to_authors, check_isbn, check_doi)
from calibre.utils.ipc.simple_worker import fork_job, WorkerError
from polyglot.builtins import iteritems, unicode_type
from ebook_converter.utils.ipc.simple_worker import fork_job, WorkerError
from ebook_converter.polyglot.builtins import iteritems, unicode_type
def get_tools():
from calibre.ebooks.pdf.pdftohtml import PDFTOHTML
from ebook_converter.ebooks.pdf.pdftohtml import PDFTOHTML
base = os.path.dirname(PDFTOHTML)
suffix = '.exe' if iswindows else ''
pdfinfo = os.path.join(base, 'pdfinfo') + suffix
@@ -32,12 +32,14 @@ def read_info(outputdir, get_cover):
way to pass unicode paths via command line arguments. This also ensures
that if poppler crashes, no stale file handles are left for the original
file, only for src.pdf.'''
os.chdir(outputdir)
pdfinfo, pdftoppm = get_tools()
source_file = os.path.join(outputdir, 'src.pdf')
cover_file = os.path.join(outputdir, 'cover')
ans = {}
try:
raw = subprocess.check_output([pdfinfo, '-enc', 'UTF-8', '-isodates', 'src.pdf'])
raw = subprocess.check_output([pdfinfo, '-enc', 'UTF-8', '-isodates',
source_file])
except subprocess.CalledProcessError as e:
prints('pdfinfo errored out with return code: %d'%e.returncode)
return None
@@ -61,7 +63,7 @@ def read_info(outputdir, get_cover):
# https://cgit.freedesktop.org/poppler/poppler/commit/?id=c91483aceb1b640771f572cb3df9ad707e5cad0d
# we can no longer rely on it.
try:
raw = subprocess.check_output([pdfinfo, '-meta', 'src.pdf']).strip()
raw = subprocess.check_output([pdfinfo, '-meta', source_file]).strip()
except subprocess.CalledProcessError as e:
prints('pdfinfo failed to read XML metadata with return code: %d'%e.returncode)
else:
@@ -74,8 +76,8 @@ def read_info(outputdir, get_cover):
if get_cover:
try:
subprocess.check_call([pdftoppm, '-singlefile', '-jpeg', '-cropbox',
'src.pdf', 'cover'])
subprocess.check_call([pdftoppm, '-singlefile', '-jpeg',
'-cropbox', source_file, cover_file])
except subprocess.CalledProcessError as e:
prints('pdftoppm errored out with return code: %d'%e.returncode)
@@ -114,17 +116,7 @@ def get_metadata(stream, cover=True):
stream.seek(0)
with open(os.path.join(pdfpath, 'src.pdf'), 'wb') as f:
shutil.copyfileobj(stream, f)
try:
res = fork_job('calibre.ebooks.metadata.pdf', 'read_info',
(pdfpath, bool(cover)))
except WorkerError as e:
prints(e.orig_tb)
raise RuntimeError('Failed to run pdfinfo')
info = res['result']
with open(res['stdout_stderr'], 'rb') as f:
raw = f.read().strip()
if raw:
prints(raw)
info = read_info(pdfpath, bool(cover))
if info is None:
raise ValueError('Could not read info dict from PDF')
covpath = os.path.join(pdfpath, 'cover.jpg')
@@ -140,8 +132,6 @@ def get_metadata(stream, cover=True):
else:
au = string_to_authors(au)
mi = MetaInformation(title, au)
# if isbn is not None:
# mi.isbn = isbn
creator = info.get('Creator', None)
if creator:
@@ -161,7 +151,7 @@ def get_metadata(stream, cover=True):
mi.tags.insert(0, subject)
if 'xmp_metadata' in info:
from calibre.ebooks.metadata.xmp import consolidate_metadata
from ebook_converter.ebooks.metadata.xmp import consolidate_metadata
mi = consolidate_metadata(mi, info)
# Look for recognizable identifiers in the info dict, if they were not
@@ -182,9 +172,8 @@ def get_metadata(stream, cover=True):
get_quick_metadata = partial(get_metadata, cover=False)
from calibre.utils.podofo import set_metadata as podofo_set_metadata
#from ebook_converter.utils.podofo import set_metadata as podofo_set_metadata
def set_metadata(stream, mi):
stream.seek(0)
return podofo_set_metadata(stream, mi)
return None

View File

@@ -12,14 +12,14 @@ from collections import defaultdict
from lxml import etree
from lxml.builder import ElementMaker
from calibre import prints
from calibre.ebooks.metadata import check_isbn, check_doi
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.metadata.opf2 import dump_dict
from calibre.utils.date import parse_date, isoformat, now
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
from polyglot.builtins import iteritems, string_or_bytes, filter
from ebook_converter import prints
from ebook_converter.ebooks.metadata import check_isbn, check_doi
from ebook_converter.utils.xml_parse import safe_xml_fromstring
from ebook_converter.ebooks.metadata.book.base import Metadata
from ebook_converter.ebooks.metadata.opf2 import dump_dict
from ebook_converter.utils.date import parse_date, isoformat, now
from ebook_converter.utils.localization import canonicalize_lang, lang_as_iso639_1
from ebook_converter.polyglot.builtins import iteritems, string_or_bytes, filter
_xml_declaration = re.compile(r'<\?xml[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE)
@@ -173,8 +173,8 @@ def read_series(root):
def read_user_metadata(mi, root):
from calibre.utils.config import from_json
from calibre.ebooks.metadata.book.json_codec import decode_is_multiple
from ebook_converter.utils.config import from_json
from ebook_converter.ebooks.metadata.book.json_codec import decode_is_multiple
fields = set()
for item in XPath('//calibre:custom_metadata')(root):
for li in XPath('./rdf:Bag/rdf:li')(item):
@@ -436,8 +436,8 @@ def create_series(calibre, series, series_index):
def create_user_metadata(calibre, all_user_metadata):
from calibre.utils.config import to_json
from calibre.ebooks.metadata.book.json_codec import object_to_unicode, encode_is_multiple
from ebook_converter.utils.config import to_json
from ebook_converter.ebooks.metadata.book.json_codec import object_to_unicode, encode_is_multiple
s = calibre.makeelement(expand('calibre:custom_metadata'))
calibre.append(s)
@@ -640,7 +640,7 @@ def merge_xmp_packet(old, new):
if __name__ == '__main__':
from calibre.utils.podofo import get_xmp_metadata
from ebook_converter.utils.podofo import get_xmp_metadata
xmp_packet = get_xmp_metadata(sys.argv[-1])
mi = metadata_from_xmp_packet(xmp_packet)
np = metadata_to_xmp_packet(mi)

View File

@@ -14,7 +14,7 @@ from ebook_converter.utils.img import save_cover_data_to, scale_image, image_to_
from ebook_converter.utils.imghdr import what
from ebook_converter.ebooks import normalize
from ebook_converter.polyglot.builtins import unicode_type, range, as_bytes, map
from tinycss.color3 import parse_color_string
from ebook_converter.tinycss.color3 import parse_color_string
IMAGE_MAX_SIZE = 10 * 1024 * 1024
RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed))

View File

@@ -500,7 +500,6 @@ class Style(object):
background shortcut properties. Note that inheritance/default values
are not used. None is returned if no background color is set.
'''
def validate_color(col):
return cssprofiles.validateWithProfile('color',
col,

View File

@@ -98,7 +98,7 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
with lopen(index, 'r+b') as i:
raw = i.read().decode('utf-8', 'replace')
raw = flip_images(raw)
raw = raw.replace('<head', '<!-- created by ebook_converter\'s pdftohtml -->\n <head', 1)
raw = raw.replace('<head', '<!-- created by ebook-converter\'s pdftohtml -->\n <head', 1)
i.seek(0)
i.truncate()
# versions of pdftohtml >= 0.20 output self closing <br> tags, this

View File

@@ -12,9 +12,9 @@
from .version import VERSION
__version__ = VERSION
from tinycss.css21 import CSS21Parser
from tinycss.page3 import CSSPage3Parser
from tinycss.fonts3 import CSSFonts3Parser
from ebook_converter.tinycss.css21 import CSS21Parser
from ebook_converter.tinycss.page3 import CSSPage3Parser
from ebook_converter.tinycss.fonts3 import CSSFonts3Parser
from ebook_converter.tinycss.media3 import CSSMedia3Parser

View File

@@ -13,10 +13,10 @@
from __future__ import unicode_literals
from itertools import chain, islice
from tinycss.decoding import decode
from tinycss.token_data import TokenList
from tinycss.tokenizer import tokenize_grouped
from tinycss.parsing import (
from ebook_converter.tinycss.decoding import decode
from ebook_converter.tinycss.token_data import TokenList
from ebook_converter.tinycss.tokenizer import tokenize_grouped
from ebook_converter.tinycss.parsing import (
strip_whitespace, remove_whitespace, split_on_comma, validate_value,
validate_any, ParseError)

View File

@@ -15,7 +15,7 @@ from __future__ import unicode_literals
import operator
import re
from polyglot.binary import from_hex_bytes
from ebook_converter.polyglot.binary import from_hex_bytes
__all__ = ['decode'] # Everything else is implementation detail

View File

@@ -8,8 +8,8 @@ __copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
import re
from ebook_converter.polyglot.builtins import map
from tinycss.css21 import CSS21Parser, ParseError
from tinycss.tokenizer import tokenize_grouped
from ebook_converter.tinycss.css21 import CSS21Parser, ParseError
from ebook_converter.tinycss.tokenizer import tokenize_grouped
def parse_font_family_tokens(tokens):

View File

@@ -5,8 +5,8 @@ from __future__ import absolute_import, division, print_function, unicode_litera
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
from tinycss.css21 import CSS21Parser
from tinycss.parsing import remove_whitespace, split_on_comma, ParseError
from ebook_converter.tinycss.css21 import CSS21Parser
from ebook_converter.tinycss.parsing import remove_whitespace, split_on_comma, ParseError
from ebook_converter.polyglot.builtins import error_message

View File

@@ -14,7 +14,7 @@
from __future__ import unicode_literals
from tinycss import token_data
from ebook_converter.tinycss import token_data
def tokenize_flat(css_source, ignore_comments=True,