ebook-converter/ebook_converter/ebooks/conversion/plugins/djvu_input.py

# -*- coding: utf-8 -*-

from __future__ import absolute_import, division, print_function, unicode_literals

__license__ = 'GPL 3'
__copyright__ = '2011, Anthon van der Neut <anthon@mnt.org>'
__docformat__ = 'restructuredtext en'

import os
from io import BytesIO

from ebook_converter.customize.conversion import InputFormatPlugin
from ebook_converter.polyglot.builtins import getcwd


class DJVUInput(InputFormatPlugin):

    name        = 'DJVU Input'
    author      = 'Anthon van der Neut'
    description = 'Convert OCR-ed DJVU files (.djvu) to HTML'
    file_types  = {'djvu', 'djv'}
    commit_name = 'djvu_input'

    def convert(self, stream, options, file_ext, log, accelerators):
        from ebook_converter.ebooks.txt.processor import convert_basic

        stdout = BytesIO()
        from ebook_converter.ebooks.djvu.djvu import DJVUFile
        x = DJVUFile(stream)
        x.get_text(stdout)
        raw_text = stdout.getvalue()
        if not raw_text:
            raise ValueError('The DJVU file contains no text, only images, probably page scans.'
                    ' calibre only supports conversion of DJVU files with actual text in them.')

        html = convert_basic(raw_text.replace(b"\n", b' ').replace(
            b'\037', b'\n\n'))
        # Run the HTMLized text through the html processing plugin.
        from ebook_converter.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        base = getcwd()
        htmlfile = os.path.join(base, 'index.html')
        c = 0
        while os.path.exists(htmlfile):
            c += 1
            htmlfile = os.path.join(base, 'index%d.html'%c)
        with open(htmlfile, 'wb') as f:
            f.write(html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
        # Generate oeb from html conversion.
        with open(htmlfile, 'rb') as f:
            oeb = html_input.convert(f, options, 'html', log,
                {})
        options.debug_pipeline = odi
        os.remove(htmlfile)

        # Set metadata from file.
        from ebook_converter.customize.ui import get_file_type_metadata
        from ebook_converter.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        mi = get_file_type_metadata(stream, file_ext)
        meta_info_to_oeb_metadata(mi, oeb.metadata, log)

        return oeb