Fixing leftovers from first concept of constants

2026-04-12 08:03:33 +02:00 · 2020-06-07 11:59:00 +02:00
parent 7419954e0c
commit a69884d724
9 changed files with 652 additions and 464 deletions
--- a/ebook_converter/ebooks/mobi/reader/mobi8.py
+++ b/ebook_converter/ebooks/mobi/reader/mobi8.py
@@ -8,37 +8,28 @@ import uuid

 from lxml import etree

-from ebook_converter import constants as const
 from ebook_converter.ebooks.mobi.reader.headers import NULL_INDEX
 from ebook_converter.ebooks.mobi.reader.index import read_index
 from ebook_converter.ebooks.mobi.reader.ncx import read_ncx, build_toc
 from ebook_converter.ebooks.mobi.reader.markup import expand_mobi8_markup
-from ebook_converter.ebooks.mobi.reader.containers import Container, find_imgtype
+from ebook_converter.ebooks.mobi.reader import containers
 from ebook_converter.ebooks.metadata.opf2 import Guide, OPFCreator
 from ebook_converter.ebooks.metadata.toc import TOC
 from ebook_converter.ebooks.mobi.utils import read_font_record
 from ebook_converter.ebooks.oeb.parse_utils import parse_html
-from ebook_converter.ebooks.oeb.base import XPath, xml2text
+from ebook_converter.ebooks.oeb import base
 from ebook_converter.polyglot.builtins import as_unicode

+ID_RE = re.compile(br'''<[^>]+\s(?:id|ID)\s*=\s*['"]([^'"]+)['"]''')
+NAME_RE = re.compile(br'''<\s*a\s*\s(?:name|NAME)\s*=\s*['"]([^'"]+)['"]''')
+AID_RE = re.compile(br'''<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]''')
+Part = collections.namedtuple('Part', 'num type filename start end aid')
+Elem = collections.namedtuple('Elem', 'insert_pos toc_text file_number '
+                              'sequence_number start_pos length')
+FlowInfo = collections.namedtuple('FlowInfo', 'type format dir fname')

-__license__ = 'GPL v3'
-__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
-__docformat__ = 'restructuredtext en'
-
-Part = collections.namedtuple('Part',
-    'num type filename start end aid')
-
-Elem = collections.namedtuple('Elem',
-    'insert_pos toc_text file_number sequence_number start_pos '
-    'length')
-
-FlowInfo = collections.namedtuple('FlowInfo',
-        'type format dir fname')

 # locate beginning and ending positions of tag with specific aid attribute
-
-
 def locate_beg_end_of_tag(ml, aid):
    pattern = br'''<[^>]*\said\s*=\s*['"]%s['"][^>]*>''' % aid
    aid_pattern = re.compile(pattern, re.IGNORECASE)
@@ -64,7 +55,8 @@ def reverse_tag_iter(block):
        end = plt


-def get_first_resource_index(first_image_index, num_of_text_records, first_text_record_number):
+def get_first_resource_index(first_image_index, num_of_text_records,
+                             first_text_record_number):
    first_resource_index = first_image_index
    if first_resource_index in {-1, NULL_INDEX}:
        first_resource_index = num_of_text_records + first_text_record_number
@@ -78,23 +70,27 @@ class Mobi8Reader(object):
        self.mobi6_reader, self.log = mobi6_reader, log
        self.header = mobi6_reader.book_header
        self.encrypted_fonts = []
-        self.id_re = re.compile(br'''<[^>]+\s(?:id|ID)\s*=\s*['"]([^'"]+)['"]''')
-        self.name_re = re.compile(br'''<\s*a\s*\s(?:name|NAME)\s*=\s*['"]([^'"]+)['"]''')
-        self.aid_re = re.compile(br'''<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]''')
+        self.id_re = ID_RE
+        self.name_re = NAME_RE
+        self.aid_re = AID_RE

    def __call__(self):
        self.mobi6_reader.check_for_drm()
        self.aid_anchor_suffix = uuid.uuid4().hex.encode('utf-8')
        bh = self.mobi6_reader.book_header
+        _gfri = get_first_resource_index
        if self.mobi6_reader.kf8_type == 'joint':
            offset = self.mobi6_reader.kf8_boundary + 2
-            self.resource_offsets = [
-                (get_first_resource_index(bh.first_image_index, bh.mobi6_records, 1), offset - 2),
-                (get_first_resource_index(bh.kf8_first_image_index, bh.records, offset), len(self.mobi6_reader.sections)),
-            ]
+            self.resource_offsets = [(_gfri(bh.first_image_index,
+                                            bh.mobi6_records, 1), offset - 2),
+                                     (_gfri(bh.kf8_first_image_index,
+                                            bh.records, offset),
+                                      len(self.mobi6_reader.sections))]
        else:
            offset = 1
-            self.resource_offsets = [(get_first_resource_index(bh.first_image_index, bh.records, offset), len(self.mobi6_reader.sections))]
+            self.resource_offsets = [(_gfri(bh.first_image_index, bh.records,
+                                            offset),
+                                      len(self.mobi6_reader.sections))]

        self.processed_records = self.mobi6_reader.extract_text(offset=offset)
        self.raw_ml = self.mobi6_reader.mobi_html
@@ -123,37 +119,37 @@ class Mobi8Reader(object):
                raise ValueError('KF8 does not have a valid FDST record')
            sec_start, num_sections = struct.unpack_from(b'>LL', header, 4)
            secs = struct.unpack_from(b'>%dL' % (num_sections*2),
-                    header, sec_start)
+                                      header, sec_start)
            self.flow_table = tuple(zip(secs[::2], secs[1::2]))

        self.files = []
        if self.header.skelidx != NULL_INDEX:
            table = read_index(self.kf8_sections, self.header.skelidx,
-                    self.header.codec)[0]
-            File = collections.namedtuple('File',
-                'file_number name divtbl_count start_position length')
+                               self.header.codec)[0]
+            File = collections.namedtuple('File', 'file_number name '
+                                          'divtbl_count start_position length')

            for i, text in enumerate(table):
                tag_map = table[text]
                self.files.append(File(i, text, tag_map[1][0],
-                    tag_map[6][0], tag_map[6][1]))
+                                       tag_map[6][0], tag_map[6][1]))

        self.elems = []
        if self.header.dividx != NULL_INDEX:
            table, cncx = read_index(self.kf8_sections, self.header.dividx,
-                    self.header.codec)
+                                     self.header.codec)
            for i, text in enumerate(table):
                tag_map = table[text]
                toc_text = cncx[tag_map[2][0]]
                self.elems.append(Elem(int(text), toc_text, tag_map[3][0],
-                    tag_map[4][0], tag_map[6][0], tag_map[6][1]))
+                                       tag_map[4][0], tag_map[6][0],
+                                       tag_map[6][1]))

        self.guide = []
        if self.header.othidx != NULL_INDEX:
            table, cncx = read_index(self.kf8_sections, self.header.othidx,
-                    self.header.codec)
-            Item = collections.namedtuple('Item',
-                'type title pos_fid')
+                                     self.header.codec)
+            Item = collections.namedtuple('Item', 'type title pos_fid')

            for i, ref_type in enumerate(table):
                tag_map = table[ref_type]
@@ -161,7 +157,7 @@ class Mobi8Reader(object):
                title = cncx[tag_map[1][0]]
                fileno = None
                if 3 in list(tag_map.keys()):
-                    fileno  = tag_map[3][0]
+                    fileno = tag_map[3][0]
                if 6 in list(tag_map.keys()):
                    fileno = tag_map[6]
                if isinstance(ref_type, bytes):
@@ -205,17 +201,19 @@ class Mobi8Reader(object):
                head = skeleton[:insertpos]
                tail = skeleton[insertpos:]
                if (tail.find(b'>') < tail.find(b'<') or head.rfind(b'>') <
-                    head.rfind(b'<')):
+                        head.rfind(b'<')):
                    # There is an incomplete tag in either the head or tail.
                    # This can happen for some badly formed KF8 files, see for
                    # example, https://bugs.launchpad.net/bugs/1082669
                    if not inspos_warned:
-                        self.log.warn(
-                            'The div table for %s has incorrect insert '
-                            'positions. Calculating manually.'%skelname)
+                        self.log.warn('The div table for %s has incorrect '
+                                      'insert positions. Calculating '
+                                      'manually.' % skelname)
                        inspos_warned = True
                    bp, ep = locate_beg_end_of_tag(skeleton, aidtext if
-                        isinstance(aidtext, bytes) else aidtext.encode('utf-8'))
+                                                   isinstance(aidtext, bytes)
+                                                   else
+                                                   aidtext.encode('utf-8'))
                    if bp != ep:
                        insertpos = ep + 1 + startpos

@@ -228,7 +226,7 @@ class Mobi8Reader(object):
                aidtext = str(uuid.uuid4())
                filename = aidtext + '.html'
            self.partinfo.append(Part(skelnum, 'text', filename, skelpos,
-                baseptr, aidtext))
+                                      baseptr, aidtext))

        # The primary css style sheet is typically stored next followed by any
        # snippets of code that were previously inlined in the
@@ -238,10 +236,10 @@ class Mobi8Reader(object):
        # The problem is that for most browsers and ereaders, you can not
        # use <img src="imageXXXX.svg" /> to import any svg image that itself
        # properly uses an <image/> tag to import some raster image - it
-        # should work according to the spec but does not for almost all browsers
-        # and ereaders and causes epub validation issues because those  raster
-        # images are in manifest but not in xhtml text - since they only
-        # referenced from an svg image
+        # should work according to the spec but does not for almost all
+        # browsers and ereaders and causes epub validation issues because
+        # those  raster images are in manifest but not in xhtml text - since
+        # they only referenced from an svg image

        # So we need to check the remaining flow pieces to see if they are css
        # or svg images.  if svg images, we must check if they have an <image/>
@@ -252,7 +250,8 @@ class Mobi8Reader(object):

        self.flowinfo.append(FlowInfo(None, None, None, None))
        svg_tag_pattern = re.compile(br'''(<svg[^>]*>)''', re.IGNORECASE)
-        image_tag_pattern = re.compile(br'''(<(?:svg:)?image[^>]*>)''', re.IGNORECASE)
+        image_tag_pattern = re.compile(br'''(<(?:svg:)?image[^>]*>)''',
+                                       re.IGNORECASE)
        for j in range(1, len(self.flows)):
            flowpart = self.flows[j]
            nstr = '%04d' % j
@@ -276,7 +275,8 @@ class Mobi8Reader(object):
                # search for CDATA and if exists inline it
                if flowpart.find(b'[CDATA[') >= 0:
                    typ = 'css'
-                    flowpart = b'<style type="text/css">\n' + flowpart + b'\n</style>\n'
+                    flowpart = (b'<style type="text/css">\n' + flowpart +
+                                b'\n</style>\n')
                    format = 'inline'
                    dir = None
                    fname = None
@@ -300,7 +300,8 @@ class Mobi8Reader(object):

    def get_id_tag_by_pos_fid(self, posfid, offset):
        # first convert kindle:pos:fid and offset info to position in file
-        insertpos, idtext, filenum, seqnm, startpos, length = self.elems[posfid]
+        (insertpos, idtext, filenum,
+         seqnm, startpos, length) = self.elems[posfid]
        pos = insertpos + offset
        fi = self.get_file_info(pos)
        # an existing "id=" must exist in original xhtml otherwise it would not
@@ -311,20 +312,20 @@ class Mobi8Reader(object):
        # so find the closest "id=" before position the file by actually
        # searching in that file
        idtext = self.get_id_tag(pos)
-        return '%s/%s'%(fi.type, fi.filename), idtext
+        return '%s/%s' % (fi.type, fi.filename), idtext

    def get_id_tag(self, pos):
        # Find the first tag with a named anchor (name or id attribute) before
        # pos
        fi = self.get_file_info(pos)
        if fi.num is None and fi.start is None:
-            raise ValueError('No file contains pos: %d'%pos)
+            raise ValueError('No file contains pos: %d' % pos)
        textblock = self.parts[fi.num]
        npos = pos - fi.start
        pgt = textblock.find(b'>', npos)
        plt = textblock.find(b'<', npos)
-        # if npos inside a tag then search all text before the its end of tag marker
-        # else not in a tag need to search the preceding tag
+        # if npos inside a tag then search all text before the its end of tag
+        # marker else not in a tag need to search the preceding tag
        if plt == npos or pgt < plt:
            npos = pgt + 1
        textblock = textblock[0:npos]
@@ -371,7 +372,7 @@ class Mobi8Reader(object):
                linktgt = fi.filename
                if idtext:
                    linktgt += '#' + idtext
-                g = Guide.Reference('%s/%s'%(fi.type, linktgt), os.getcwd())
+                g = Guide.Reference('%s/%s' % (fi.type, linktgt), os.getcwd())
                g.title, g.type = 'start', 'text'
                guide.append(g)

@@ -379,7 +380,7 @@ class Mobi8Reader(object):

    def create_ncx(self):
        index_entries = read_ncx(self.kf8_sections, self.header.ncxidx,
-                self.header.codec)
+                                 self.header.codec)
        remove = []

        # Add href and anchor info to the index entries
@@ -389,15 +390,15 @@ class Mobi8Reader(object):
                pos = entry['pos']
                fi = self.get_file_info(pos)
                if fi.filename is None:
-                    raise ValueError('Index entry has invalid pos: %d'%pos)
+                    raise ValueError('Index entry has invalid pos: %d' % pos)
                idtag = self.get_id_tag(pos)
-                href = '%s/%s'%(fi.type, fi.filename)
+                href = '%s/%s' % (fi.type, fi.filename)
            else:
                try:
                    href, idtag = self.get_id_tag_by_pos_fid(*pos_fid)
                except ValueError:
-                    self.log.warn('Invalid entry in NCX (title: %s), ignoring'
-                                  %entry['text'])
+                    self.log.warn('Invalid entry in NCX (title: %s), '
+                                  'ignoring' % entry['text'])
                    remove.append(entry)
                    continue

@@ -411,7 +412,8 @@ class Mobi8Reader(object):
        return build_toc(index_entries)

    def extract_resources(self, sections):
-        from ebook_converter.ebooks.mobi.writer2.resources import PLACEHOLDER_GIF
+        from ebook_converter.ebooks.mobi.writer2.resources import \
+                PLACEHOLDER_GIF
        resource_map = []
        container = None
        for x in ('fonts', 'images'):
@@ -424,16 +426,18 @@ class Mobi8Reader(object):
                typ = data[:4]
                href = None
                if typ in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n', b'BOUN',
-                        b'FDST', b'DATP', b'AUDI', b'VIDE', b'RESC', b'CMET', b'PAGE'}:
+                           b'FDST', b'DATP', b'AUDI', b'VIDE', b'RESC',
+                           b'CMET', b'PAGE'}:
                    pass  # Ignore these records
                elif typ == b'FONT':
                    font = read_font_record(data)
                    href = "fonts/%05d.%s" % (fname_idx, font['ext'])
                    if font['err']:
-                        self.log.warn('Reading font record %d failed: %s'%(
-                            fname_idx, font['err']))
+                        self.log.warn('Reading font record %d failed: %s' %
+                                      (fname_idx, font['err']))
                        if font['headers']:
-                            self.log.debug('Font record headers: %s'%font['headers'])
+                            self.log.debug('Font record headers: %s' %
+                                           font['headers'])
                    with open(href.replace('/', os.sep), 'wb') as f:
                        f.write(font['font_data'] if font['font_data'] else
                                font['raw_data'])
@@ -443,19 +447,23 @@ class Mobi8Reader(object):
                    if data == b'CONTBOUNDARY':
                        container = None
                        continue
-                    container = Container(data)
+                    container = containers.Container(data)
                elif typ == b'CRES':
                    data, imgtype = container.load_image(data)
                    if data is not None:
-                        href = 'images/%05d.%s'%(container.resource_index, imgtype)
+                        href = 'images/%05d.%s' % (container.resource_index,
+                                                   imgtype)
                        with open(href.replace('/', os.sep), 'wb') as f:
                            f.write(data)
-                elif typ == b'\xa0\xa0\xa0\xa0' and len(data) == 4 and container is not None:
+                elif (typ == b'\xa0\xa0\xa0\xa0' and
+                      len(data) == 4 and
+                      container is not None):
                    container.resource_index += 1
                elif container is None:
-                    if not (len(data) == len(PLACEHOLDER_GIF) and data == PLACEHOLDER_GIF):
-                        imgtype = find_imgtype(data)
-                        href = 'images/%05d.%s'%(fname_idx, imgtype)
+                    if not (len(data) == len(PLACEHOLDER_GIF) and
+                            data == PLACEHOLDER_GIF):
+                        imgtype = containers.find_imgtype(data)
+                        href = 'images/%05d.%s' % (fname_idx, imgtype)
                        with open(href.replace('/', os.sep), 'wb') as f:
                            f.write(data)

@@ -482,7 +490,7 @@ class Mobi8Reader(object):
                    if os.path.exists(href.replace('/', os.sep)):
                        try:
                            toc = self.read_inline_toc(href, frag)
-                        except:
+                        except Exception:
                            self.log.exception('Failed to read inline ToC')

        opf = OPFCreator(os.getcwd(), mi)
@@ -493,7 +501,8 @@ class Mobi8Reader(object):

        # If there are no images then the azw3 input plugin dumps all
        # binary records as .unknown images, remove them
-        if self.for_tweak and os.path.exists('images') and os.path.isdir('images'):
+        if (self.for_tweak and os.path.exists('images') and
+                os.path.isdir('images')):
            files = os.listdir('images')
            unknown = [x for x in files if x.endswith('.unknown')]
            if len(files) == len(unknown):
@@ -502,7 +511,7 @@ class Mobi8Reader(object):
        if self.for_tweak:
            try:
                os.remove('debug-raw.html')
-            except:
+            except Exception:
                pass

        opf.create_manifest_from_files_in([os.getcwd()], exclude=exclude)
@@ -528,7 +537,7 @@ class Mobi8Reader(object):
        with open(href.replace('/', os.sep), 'rb') as f:
            raw = f.read().decode(self.header.codec)
        root = parse_html(raw, log=self.log)
-        body = XPath('//h:body')(root)
+        body = base.XPath('//h:body')(root)
        reached = False
        if body:
            start = body[0]
@@ -536,7 +545,7 @@ class Mobi8Reader(object):
            start = None
            reached = True
        if frag:
-            elems = XPath('//*[@id="%s"]'%frag)(root)
+            elems = base.XPath('//*[@id="%s"]' % frag)(root)
            if elems:
                start = elems[0]

@@ -554,12 +563,13 @@ class Mobi8Reader(object):
        seen = set()
        links = []
        for elem in root.iterdescendants(etree.Element):
-            if reached and elem.tag == const.XHTML_A and elem.get('href',
+            if reached and elem.tag == base.tag('xhtml',
+                                                'a') and elem.get('href',
                                                                  False):
                href = elem.get('href')
                href, frag = urllib.parse.urldefrag(href)
                href = base_href + '/' + href
-                text = xml2text(elem).strip()
+                text = base.xml2text(elem).strip()
                if (text, href, frag) in seen:
                    continue
                seen.add((text, href, frag))
@@ -568,7 +578,7 @@ class Mobi8Reader(object):
                reached = True

        depths = sorted(set(x[-1] for x in links))
-        depth_map = {x:i for i, x in enumerate(depths)}
+        depth_map = {x: i for i, x in enumerate(depths)}
        for text, href, frag, depth in links:
            depth = depth_map[depth]
            if current_depth is None: