mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-02-19 16:25:55 +01:00
Here is the first batch of modules, which are needed for converting several formats to LRF. Some of the logic has been changed; more cleanups will follow.
236 lines
8.0 KiB
Python
236 lines
8.0 KiB
Python
#!/usr/bin/env python2
|
|
# vim:fileencoding=utf-8
|
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
|
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
|
|
|
import os
|
|
from ebook_converter.polyglot.builtins import itervalues, range
|
|
|
|
# Non-breaking space (U+00A0). cleanup_markup() uses it as the text of
# paragraphs that are left without text after a page-break <br> is removed.
NBSP = '\xa0'
|
|
|
|
|
|
def mergeable(previous, current):
    """Return True if ``current`` may be folded into ``previous``.

    Both elements must be free of tail text and agree on their ``class``,
    ``style``, ``lang`` and ``dir`` attributes; ``current`` must not carry a
    non-empty ``id`` and must be the sibling immediately following
    ``previous``.
    """
    if previous.tail or current.tail:
        # Text between the two elements would be swallowed by a merge.
        return False
    if previous.get('class', None) != current.get('class', None):
        return False
    if current.get('id', False):
        # An id may be a link target, so the element has to survive.
        return False
    if any(previous.get(name) != current.get(name)
           for name in ('style', 'lang', 'dir')):
        return False
    # Only directly adjacent siblings can be merged; when ``previous`` has no
    # following sibling the default of None never matches ``current``.
    following = next(previous.itersiblings(), None)
    return following is current
|
|
|
|
|
|
def append_text(parent, text):
    """Append ``text`` to the very end of ``parent``'s content.

    When ``parent`` has children the text is added to the tail of its last
    child, otherwise it is added to ``parent.text``.
    """
    if len(parent) == 0:
        parent.text = (parent.text or '') + text
        return
    last_child = parent[-1]
    last_child.tail = (last_child.tail or '') + text
|
|
|
|
|
|
def merge(parent, span):
    """Move the contents of ``span`` to the end of ``parent`` and drop ``span``.

    The text, the children and the tail of ``span`` are appended to
    ``parent`` in that order, then ``span`` is removed from its own parent
    element.
    """
    leading = span.text
    if leading:
        append_text(parent, leading)
    for node in span:
        parent.append(node)
    trailing = span.tail
    if trailing:
        append_text(parent, trailing)
    container = span.getparent()
    container.remove(span)
|
|
|
|
|
|
def merge_run(run):
    """Collapse a run of elements into its first member.

    Every element after ``run[0]`` is merged into ``run[0]``, in order.
    """
    target, followers = run[0], run[1:]
    for follower in followers:
        merge(target, follower)
|
|
|
|
|
|
def liftable(css):
    """Return True if every property in ``css`` belongs to an allowed family.

    A <span> is liftable if all its styling would work just as well if it is
    specified on the parent element. A property qualifies when the part of
    its name before the first ``-`` is one of ``text``, ``font``, ``letter``,
    ``color`` or ``background``. An empty ``css`` is liftable.
    """
    allowed = {'text', 'font', 'letter', 'color', 'background'}
    return all(prop.partition('-')[0] in allowed for prop in css)
|
|
|
|
|
|
def add_text(elem, attr, text):
    """Append ``text`` to the attribute ``attr`` of ``elem``.

    A missing or empty current value (``None`` or ``''``) is treated as the
    empty string.
    """
    existing = getattr(elem, attr)
    if not existing:
        existing = ''
    setattr(elem, attr, existing + text)
|
|
|
|
|
|
def lift(span):
    """Replace an element by its content (text, children and tail).

    The text of ``span`` is joined to whatever precedes it in the parent, its
    children take its place in the parent, and its tail is attached after the
    last of those children (or after the preceding content when ``span`` has
    no children).
    """
    parent = span.getparent()
    pos = parent.index(span)
    last_child = span[-1] if len(span) else None

    def add_before_position(text):
        # Text located at ``pos`` belongs to the parent's own text when there
        # is no preceding sibling, otherwise to that sibling's tail.
        if pos == 0:
            add_text(parent, 'text', text)
        else:
            add_text(parent[pos - 1], 'tail', text)

    if span.text:
        add_before_position(span.text)

    # Inserting in reverse at a fixed index keeps the children in their
    # original order.
    for child in reversed(span):
        parent.insert(pos, child)
    parent.remove(span)

    trailing = span.tail
    if trailing:
        if last_child is None:
            add_before_position(trailing)
        else:
            add_text(last_child, 'tail', trailing)
|
|
|
|
|
|
def before_count(root, tag, limit=10):
    """Count the elements that come before ``tag`` inside the document body.

    The descendants of the first element matched by ``//body[1]`` are walked
    in document order.

    :param root: element supporting ``xpath()``.
    :param tag: the element to look for (compared by identity).
    :param limit: upper bound for the returned count.
    :return: the number of descendants seen before ``tag``, capped at
        ``limit``. ``limit`` is also returned when there is no body or when
        ``tag`` is not found among the body's descendants.
    """
    body = root.xpath('//body[1]')
    if not body:
        return limit
    for count, elem in enumerate(body[0].iterdescendants()):
        if elem is tag:
            return count
        if count >= limit:
            # More than ``limit`` elements precede the tag (if it is there at
            # all); no need to look any further.
            return limit
    # ``tag`` is not a descendant of the body. Always return an integer so
    # callers can compare the result without risking a None comparison.
    return limit
|
|
|
|
|
|
def wrap_contents(tag_name, elem):
    """Wrap everything inside ``elem`` in a new ``tag_name`` element.

    The text and all children of ``elem`` are moved into a freshly created
    element, which then becomes the only child of ``elem``. ``elem.text`` is
    left as the empty string.

    :param tag_name: tag of the wrapper element to create.
    :param elem: the element whose contents are wrapped; modified in place.
    """
    wrapper = elem.makeelement(tag_name)
    wrapper.text, elem.text = elem.text, ''
    # Iterate over a snapshot of the children: the loop removes them from
    # ``elem``, and mutating a container while iterating it directly is only
    # correct if the iterator happens to look ahead.
    for child in list(elem):
        elem.remove(child)
        wrapper.append(child)
    elem.append(wrapper)
|
|
|
|
|
|
def cleanup_markup(log, root, styles, dest_dir, detect_cover, XPath):
    """Simplify the generated markup in ``root`` in place.

    The following passes are run, in order: wrap spans flagged with
    ``data-docx-vert``, move trailing ``<hr>`` elements out of their block,
    merge adjacent spans with identical styling, normalise ``dir``
    attributes, fold a block's single wrapping span into the block, turn
    purely bold/italic spans into ``<b>``/``<i>``, drop attribute-less spans,
    convert page-break ``<br>`` paragraphs and, optionally, detect a cover
    image.

    :param log: logger; only ``log.debug()`` is called.
    :param root: root element of the tree; must support ``xpath()``.
    :param styles: object whose ``classes`` mapping has ``(class name, css)``
        pairs as values; ``css`` is a mapping of property name to value.
    :param dest_dir: directory that image ``src`` attributes are relative to.
    :param detect_cover: if true, check whether the first image looks like a
        cover.
    :param XPath: callable that turns an XPath expression into a callable
        accepting an element.
    :return: the path of the detected cover image (which is then removed from
        the tree), or ``None`` if no cover was detected.
    """
    # Apply vertical-align
    for span in root.xpath('//span[@data-docx-vert]'):
        wrap_contents(span.attrib.pop('data-docx-vert'), span)

    # Move <hr>s outside paragraphs, if possible.
    pancestor = XPath('|'.join('ancestor::%s[1]' % x for x in ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6')))
    for hr in root.xpath('//span/hr'):
        p = pancestor(hr)
        if p:
            p = p[0]
            descendants = tuple(p.iterdescendants())
            # Only an <hr> that is the very last thing in the block can be
            # moved after it without reordering content.
            if descendants[-1] is hr:
                parent = p.getparent()
                idx = parent.index(p)
                parent.insert(idx+1, hr)
                hr.tail = '\n\t'

    # Merge consecutive spans that have the same styling
    current_run = []
    for span in root.xpath('//span'):
        if current_run and not mergeable(current_run[-1], span):
            if len(current_run) > 1:
                merge_run(current_run)
            current_run = []
        current_run.append(span)
    # Flush the run that was still open when the spans ran out, otherwise the
    # last group of mergeable spans in the document is never merged.
    if len(current_run) > 1:
        merge_run(current_run)

    # Process dir attributes
    class_map = dict(itervalues(styles.classes))
    parents = ('p', 'div') + tuple('h%d' % i for i in range(1, 7))
    # XPath test matching any of the block-level tags in ``parents``.
    block_test = ' or '.join('name()="%s"' % t for t in parents)
    for parent in root.xpath('//*[(%s)]' % block_test):
        # Ensure that children of rtl parents that are not rtl have an
        # explicit dir set. Also, remove dir from children if it is the same as
        # that of the parent.
        if len(parent):
            parent_dir = parent.get('dir')
            for child in parent.iterchildren('span'):
                child_dir = child.get('dir')
                if parent_dir == 'rtl' and child_dir != 'rtl':
                    child_dir = 'ltr'
                    child.set('dir', child_dir)
                if child_dir and child_dir == parent_dir:
                    child.attrib.pop('dir')

    # Remove unnecessary span tags that are the only child of a parent block
    # element
    for parent in root.xpath('//*[(%s) and count(span)=1]' % block_test):
        if len(parent) == 1 and not parent.text and not parent[0].tail and not parent[0].get('id', None):
            # We have a block whose contents are entirely enclosed in a <span>
            span = parent[0]
            span_class = span.get('class', None)
            span_css = class_map.get(span_class, {})
            span_dir = span.get('dir')
            if liftable(span_css) and (not span_dir or span_dir == parent.get('dir')):
                pclass = parent.get('class', None)
                if span_class:
                    pclass = (pclass + ' ' + span_class) if pclass else span_class
                    parent.set('class', pclass)
                parent.text = span.text
                parent.remove(span)
                if span.get('lang'):
                    parent.set('lang', span.get('lang'))
                if span.get('dir'):
                    parent.set('dir', span.get('dir'))
                for child in span:
                    parent.append(child)

    # Make spans whose only styling is bold or italic into <b> and <i> tags
    for span in root.xpath('//span[@class and not(@style)]'):
        css = class_map.get(span.get('class', None), {})
        if len(css) == 1:
            if css == {'font-style': 'italic'}:
                span.tag = 'i'
                del span.attrib['class']
            elif css == {'font-weight': 'bold'}:
                span.tag = 'b'
                del span.attrib['class']

    # Get rid of <span>s that have no styling
    for span in root.xpath('//span[not(@class or @id or @style or @lang or @dir)]'):
        lift(span)

    # Convert <p><br style="page-break-after:always"> </p> style page breaks
    # into something the viewer will render as a page break
    for p in root.xpath('//p[br[@style="page-break-after:always"]]'):
        if len(p) == 1 and (not p[0].tail or not p[0].tail.strip()):
            p.remove(p[0])
            prefix = p.get('style', '')
            if prefix:
                prefix += '; '
            p.set('style', prefix + 'page-break-after:always')
            p.text = NBSP if not p.text else p.text

    if detect_cover:
        # Check if the first image in the document is possibly a cover
        img = root.xpath('//img[@src][1]')
        if img:
            img = img[0]
            path = os.path.join(dest_dir, img.get('src'))
            if os.path.exists(path) and before_count(root, img, limit=10) < 5:
                from ebook_converter.utils.imghdr import identify
                try:
                    # The built-in open() is used because ``lopen`` is not
                    # defined in this module; a NameError raised here would
                    # be hidden by the handler below and silently disable
                    # cover detection.
                    with open(path, 'rb') as imf:
                        _, width, height = identify(imf)
                except Exception:
                    # Best effort: an unreadable or unrecognised image is
                    # simply not a cover. KeyboardInterrupt and SystemExit
                    # still propagate.
                    width, height = 0, 0
                try:
                    is_cover = 0.8 <= height/width <= 1.8 and height*width >= 160000
                except ZeroDivisionError:
                    is_cover = False
                if is_cover:
                    log.debug('Detected an image that looks like a cover')
                    img.getparent().remove(img)
                    return path
|