1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-01-23 05:15:45 +01:00
Files
ebook-converter/ebook_converter/ebooks/__init__.py
2020-07-17 18:59:45 +02:00

257 lines
8.1 KiB
Python

"""
Code for the conversion of ebook formats and the reading of metadata
from various formats.
"""
import numbers
import os
import re
from lxml import etree
from ebook_converter.ebooks.chardet import xml_to_unicode
class ConversionError(Exception):
    """Exception carrying a message together with an ``only_msg`` flag.

    :param msg: Text handed to the base ``Exception``.
    :param only_msg: Stored unchanged on the instance.
        NOTE(review): presumably tells callers to display just the message
        without a traceback -- confirm against the code that catches it.
    """

    def __init__(self, msg, only_msg=False):
        super().__init__(msg)
        self.only_msg = only_msg
class UnknownFormatError(Exception):
    """Exception type for an unrecognised ebook format.

    NOTE(review): meaning inferred from the name; nothing in this module
    raises it.
    """
class DRMError(ValueError):
    """``ValueError`` subclass for DRM related failures.

    NOTE(review): meaning inferred from the name; nothing in this module
    raises it.
    """
class ParserError(ValueError):
    """``ValueError`` subclass for parsing failures.

    NOTE(review): meaning inferred from the name; nothing in this module
    raises it.
    """
# Book/archive file extensions, lower case and without the leading dot.
# Kept as a plain list (order preserved) because code elsewhere may rely on
# list semantics.
BOOK_EXTENSIONS = [
    'lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'txtz', 'text', 'htm', 'xhtm',
    'html', 'htmlz', 'xhtml', 'pdf', 'pdb', 'updb', 'pdr', 'prc', 'mobi',
    'azw',
    'doc', 'epub', 'fb2', 'fbz', 'djv', 'djvu', 'lrx', 'cbr', 'cbz', 'cbc',
    'oebzip', 'rb', 'imp', 'odt', 'chm', 'tpz', 'azw1', 'pml', 'pmlz', 'mbp',
    'tan', 'snb', 'xps', 'oxps', 'azw4', 'book', 'zbf', 'pobi', 'docx',
    'docm',
    'md', 'textile', 'markdown', 'ibook', 'ibooks', 'iba', 'azw3', 'ps',
    'kepub', 'kfx',
    'kpf',
]
def return_raster_image(path):
    """Return the bytes of the file at ``path`` if it is a raster image.

    The file is read in full and its type sniffed with ``imghdr.what``.

    :param path: Filesystem path of the candidate image.
    :return: The raw file contents, or ``None`` when the file is not
        readable or the detected type is unknown (``None``) or ``'svg'``.
    """
    from ebook_converter.utils.imghdr import what

    if not os.access(path, os.R_OK):
        return None

    with open(path, 'rb') as stream:
        data = stream.read()

    if what(None, data) in (None, 'svg'):
        return None
    return data
def extract_cover_from_embedded_svg(html, base, log):
    """Extract a raster cover referenced by a lone ``<svg:image>`` element.

    The document qualifies only when it contains exactly one ``svg:svg``
    element whose single child is an ``svg:image`` with a non-empty
    ``xlink:href``.

    :param html: Markup to parse with ``lxml.etree.fromstring``.
    :param base: Directory against which the ``/``-separated href is
        resolved.
    :param log: Unused here.
    :return: Image bytes from ``return_raster_image`` or ``None``.
    """
    from ebook_converter.ebooks.oeb.base import XPath, SVG, XLINK

    tree = etree.fromstring(html)
    svg_nodes = XPath('//svg:svg')(tree)
    if len(svg_nodes) != 1 or len(svg_nodes[0]) != 1:
        return None

    child = svg_nodes[0][0]
    if child.tag != SVG('image'):
        return None

    href = child.get(XLINK('href'), None)
    if not href:
        return None

    # hrefs use "/" regardless of platform; rebuild with the OS separator.
    return return_raster_image(os.path.join(base, *href.split('/')))
def extract_calibre_cover(raw, base, log):
    """Pull the cover image out of a simple HTML cover page.

    Two layouts are recognised, both requiring that the page has none of the
    heading/paragraph/span/font/br tags:

    1. the whole document has exactly one ``<img src=...>`` whose ``alt`` is
       ``cover`` (case-insensitive);
    2. the ``<body>`` has no non-whitespace text and exactly one
       ``<img src=...>``.

    :param raw: HTML markup handed to ``BeautifulSoup``.
    :param base: Directory against which the ``/``-separated ``src`` is
        resolved.
    :param log: Unused here.
    :return: Image bytes from ``return_raster_image`` or ``None``.
    """
    from ebook_converter.ebooks.BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup(raw)
    text_tag = soup.find(name=['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span',
                               'font', 'br'])
    all_images = soup.findAll('img', src=True)
    if text_tag is not None:
        # Any text-bearing tag disqualifies both layouts.
        return None

    if (len(all_images) == 1 and
            all_images[0].get('alt', '').lower() == 'cover'):
        cover_path = os.path.join(base, *all_images[0]['src'].split('/'))
        data = return_raster_image(cover_path)
        if data is not None:
            return data

    # Look for a simple cover, i.e. a body with no text and only one <img> tag
    body = soup.find('body')
    if body is None:
        return None

    body_text = ''.join(str(node) for node in body.findAll(text=True))
    if body_text.strip():
        # Body has text, abort
        return None

    body_images = body.findAll('img', src=True)
    if len(body_images) != 1:
        return None
    return return_raster_image(
        os.path.join(base, *body_images[0]['src'].split('/')))
def render_html_svg_workaround(path_to_html, log, width=590, height=750):
    """Obtain cover image data for an HTML file, rendering it only if needed.

    Cheap extraction is tried first: the embedded-SVG extractor (only when
    the SVG namespace occurs in the markup), then the simple cover page
    extractor. Any exception raised by an extractor is swallowed and treated
    as "no result". If neither yields data the page is rendered with
    ``render_html_data``.

    :param path_to_html: Path of the HTML file.
    :param log: Passed through to the extractors.
    :param width: Forwarded to ``render_html_data``.
    :param height: Forwarded to ``render_html_data``.
    :return: Image bytes, or ``None`` if every strategy fails.
    """
    from ebook_converter.ebooks.oeb.base import SVG_NS

    with open(path_to_html, 'rb') as stream:
        raw = stream.read()
    raw = xml_to_unicode(raw, strip_encoding_pats=True)[0]

    extractors = [extract_calibre_cover]
    if SVG_NS in raw:
        extractors.insert(0, extract_cover_from_embedded_svg)

    for extract in extractors:
        try:
            cover = extract(raw, os.path.dirname(path_to_html), log)
        except Exception:
            # Best effort: fall through to the next strategy.
            continue
        if cover is not None:
            return cover

    return render_html_data(path_to_html, width, height)
def render_html_data(path_to_html, width, height):
    """Render an HTML file to JPEG in a worker process and return the bytes.

    The job ``ebook_converter.ebooks.render_html.main`` is forked with a
    temporary output directory; on success ``rendered.jpeg`` is read from
    that directory.

    :param path_to_html: Path of the HTML file to render.
    :param width: Accepted but not used by this function.
    :param height: Accepted but not used by this function.
    :return: JPEG bytes, or ``None`` when the worker raises ``WorkerError``
        or reports a falsy result (diagnostics are printed in both cases).
    """
    from ebook_converter.ptempfile import TemporaryDirectory
    from ebook_converter.utils.ipc.simple_worker import fork_job, WorkerError

    # Rebound below once the job finishes; report_error() reads it through
    # the closure, so it must stay a variable of this function.
    result = {}

    def report_error(text=''):
        # NOTE: diagnostics go to stdout (the stderr redirection was dropped).
        print(f'Failed to render {path_to_html}')
        if text:
            print(text)
        captured = result['stdout_stderr'] if result else None
        if captured:
            with open(captured, 'rb') as stream:
                print(stream.read())

    with TemporaryDirectory('-render-html') as workdir:
        try:
            result = fork_job('ebook_converter.ebooks.render_html', 'main',
                              args=(path_to_html, workdir, 'jpeg'))
        except WorkerError as err:
            report_error(err.orig_tb)
            return None

        if not result['result']:
            report_error()
            return None

        with open(os.path.join(workdir, 'rendered.jpeg'), 'rb') as stream:
            return stream.read()
def check_ebook_format(stream, current_guess):
    """Refine a guessed format by sniffing the stream for a ``TPZ`` header.

    Only guesses of ``prc``, ``mobi``, ``azw``, ``azw1`` or ``azw3``
    (case-insensitive) are checked; for those the stream is rewound, its
    first three bytes are read, and it is rewound again. Other guesses leave
    the stream untouched.

    :param stream: Seekable binary stream.
    :param current_guess: Format name guessed so far.
    :return: ``'tpz'`` if the header matches, otherwise ``current_guess``
        unchanged.
    """
    if current_guess.lower() not in ('prc', 'mobi', 'azw', 'azw1', 'azw3'):
        return current_guess

    stream.seek(0)
    magic = stream.read(3)
    stream.seek(0)
    return 'tpz' if magic == b'TPZ' else current_guess
def normalize(x):
    """Return ``x`` NFC-normalized when it is a ``str``; otherwise unchanged.

    :param x: Any object.
    :return: The NFC form of ``x`` for strings, ``x`` itself for everything
        else (including ``None`` and ``bytes``).
    """
    if not isinstance(x, str):
        return x

    import unicodedata
    return unicodedata.normalize('NFC', x)
def calibre_cover(title, author_string, series_string=None,
                  output_format='jpg', title_size=46, author_size=36,
                  logo_path=None):
    """Placeholder for cover generation; currently always returns ``None``.

    The parameters are kept so existing callers continue to work, but none
    of them is used yet.

    :param title: Book title.
    :param author_string: Author(s) as a single string.
    :param series_string: Optional series description.
    :param output_format: Requested image format.
    :param title_size: Requested title font size.
    :param author_size: Requested author font size.
    :param logo_path: Optional path to a logo image.
    :return: ``None``.
    """
    # TODO(gryf): generate cover using pillow
    # The previous implementation (NFC-normalize the three strings, call
    # ``covers.calibre_cover2(..., as_qimage=True)`` and encode the result
    # with ``utils.img.image_to_data``) sat unreachable behind this return
    # and has been dropped.
    return None
# A CSS length: group 1 is the numeric part, group 2 the (lower-case) unit.
# The numeric part is deliberately loose -- it also accepts text such as
# "-", "." or "--5" that float() rejects, so users must guard the conversion.
UNIT_RE = re.compile(r'^(-*[0-9]*[.]?[0-9]*)\s*(%|em|ex|en|px|mm|cm|in|pt|pc'
                     r'|rem|q)$')


def unit_convert(value, base, font, dpi, body_font_size=12):
    """Return value in pts.

    :param value: A number (returned unchanged), a bare numeric string
        (treated as pixels) or a CSS length such as ``'1.5em'``.
    :param base: Reference length that percentages are relative to.
    :param font: Current font size, used for ``em``/``ex``/``en``.
    :param dpi: Dots per inch, used to convert pixels.
    :param body_font_size: Root font size, used for ``rem``.
    :return: The length in points. A value that cannot be interpreted
        (no known unit, or a malformed numeric part such as ``'-px'``) is
        returned unchanged.
    """
    if isinstance(value, numbers.Number):
        return value
    try:
        # A unit-less numeric string is interpreted as pixels.
        return float(value) * 72.0 / dpi
    except Exception:
        pass
    result = value
    m = UNIT_RE.match(value)
    if m is None or not m.group(1):
        return result
    try:
        number = float(m.group(1))
    except ValueError:
        # UNIT_RE lets through numeric parts like "-" or "--5"; treat them
        # like any other unparseable value instead of raising.
        return result
    unit = m.group(2)
    if unit == '%':
        result = (number / 100.0) * base
    elif unit == 'px':
        result = number * 72.0 / dpi
    elif unit == 'in':
        result = number * 72.0
    elif unit == 'pt':
        result = number
    elif unit == 'em':
        result = number * font
    elif unit in ('ex', 'en'):
        # This is a hack for ex since we have no way to know
        # the x-height of the font
        result = number * font * 0.5
    elif unit == 'pc':
        result = number * 12.0
    elif unit == 'mm':
        result = number * 2.8346456693
    elif unit == 'cm':
        result = number * 28.346456693
    elif unit == 'rem':
        result = number * body_font_size
    elif unit == 'q':
        result = number * 0.708661417325
    return result


def parse_css_length(value):
    """Split a CSS length into its numeric value and unit.

    :param value: Text such as ``'12px'`` or ``'1.5 em'``.
    :return: ``(number, unit)`` with ``number`` a float and ``unit`` lower
        case, or ``(None, None)`` when ``value`` is not a string, does not
        match ``UNIT_RE``, or has a malformed numeric part (e.g. ``'-px'``).
    """
    try:
        m = UNIT_RE.match(value)
    except TypeError:
        return None, None
    if m is None or not m.group(1):
        return None, None
    try:
        number = float(m.group(1))
    except ValueError:
        # Numeric part matched the loose pattern but is not a real number.
        return None, None
    return number, m.group(2).lower()
def generate_masthead(title, output_path=None, width=600, height=60):
    """Generate a masthead image using the configured MOBI masthead font.

    Thin wrapper around ``ebook_converter.ebooks.covers.generate_masthead``
    that looks up ``masthead_font`` in the ``mobi_output`` defaults and
    passes it on as ``font_family``.

    :param title: Text for the masthead.
    :param output_path: Forwarded unchanged.
    :param width: Forwarded unchanged.
    :param height: Forwarded unchanged.
    :return: Whatever the wrapped function returns.
    """
    from ebook_converter.ebooks.conversion.config import load_defaults
    font_family = load_defaults('mobi_output').get('masthead_font', None)

    # Imported after the defaults are loaded, as before; aliased so it does
    # not shadow this function's own name.
    from ebook_converter.ebooks.covers import generate_masthead as render
    return render(title, output_path=output_path, width=width,
                  height=height, font_family=font_family)
def escape_xpath_attr(value):
    """Quote ``value`` so it can be used as an XPath string literal.

    XPath 1.0 has no escape character, so:

    * without double quotes the value is wrapped in double quotes;
    * with double but no single quotes it is wrapped in single quotes;
    * with both it is split on runs of double quotes and rebuilt as a
      ``concat(...)`` expression, each piece using the quote it lacks.

    :param value: Text to quote.
    :return: An XPath expression evaluating to ``value``.
    """
    if '"' not in value:
        return '"%s"' % value
    if "'" not in value:
        return "'%s'" % value

    # The capturing group keeps the runs of double quotes as their own items.
    pieces = [
        ("'%s'" if '"' in chunk else '"%s"') % chunk
        for chunk in re.split('("+)', value)
        if chunk
    ]
    return 'concat(%s)' % ', '.join(pieces)