diff --git a/README.rst b/README.rst
index c204d6a..e657b11 100644
--- a/README.rst
+++ b/README.rst
@@ -53,6 +53,8 @@ Currently, I've tested following input formats:
- rtf
- mobi
- fb2
+- html
+- pdf
Note, that old Microsoft doc format is not supported, although old documents
can be fairly easy converted using text processors programs, lik Word or
@@ -68,6 +70,7 @@ Currently, following formats are supported:
- epub
- mobi
- docx
+- htmlz (zipped HTML file with additional assets, like images)
Installation
diff --git a/ebook_converter/ebooks/conversion/plugins/pdf_input.py b/ebook_converter/ebooks/conversion/plugins/pdf_input.py
index 3dd2f03..558abde 100644
--- a/ebook_converter/ebooks/conversion/plugins/pdf_input.py
+++ b/ebook_converter/ebooks/conversion/plugins/pdf_input.py
@@ -41,8 +41,7 @@ class PDFInput(InputFormatPlugin):
PDFDocument(xml, self.opts, self.log)
return os.path.join(getcwd(), 'metadata.opf')
- def convert(self, stream, options, file_ext, log,
- accelerators):
+ def convert(self, stream, options, file_ext, log, accelerators):
from ebook_converter.ebooks.metadata.opf2 import OPFCreator
from ebook_converter.ebooks.pdf.pdftohtml import pdftohtml
diff --git a/ebook_converter/ebooks/conversion/preprocess.py b/ebook_converter/ebooks/conversion/preprocess.py
index cc821c7..eab5560 100644
--- a/ebook_converter/ebooks/conversion/preprocess.py
+++ b/ebook_converter/ebooks/conversion/preprocess.py
@@ -471,7 +471,7 @@ class HTMLPreProcessor(object):
return re.search('<H2[^><]*id=BookTitle', raw) is not None
def is_pdftohtml(self, src):
- return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
+ return '<!-- created by ebook-converter\'s pdftohtml -->' in src[:1000]
def __call__(self, html, remove_special_chars=None,
get_preprocess_html=False):
@@ -627,7 +627,7 @@ class HTMLPreProcessor(object):
html = preprocessor(html)
if is_pdftohtml:
- html = html.replace('<!-- created by calibre\'s pdftohtml -->', '')
+ html = html.replace('<!-- created by ebook-converter\'s pdftohtml -->', '')
if getattr(self.extra_opts, 'smarten_punctuation', False):
html = smarten_punctuation(html, self.log)
diff --git a/ebook_converter/ebooks/conversion/utils.py b/ebook_converter/ebooks/conversion/utils.py
index a53dad5..79a725d 100644
--- a/ebook_converter/ebooks/conversion/utils.py
+++ b/ebook_converter/ebooks/conversion/utils.py
@@ -43,7 +43,7 @@ class HeuristicProcessor(object):
self.common_in_text_beginnings = '[\\w\'\"“‘‛]'
def is_pdftohtml(self, src):
- return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
+ return '<!-- created by ebook-converter\'s pdftohtml -->' in src[:1000]
def is_abbyy(self, src):
return ']*>', re.IGNORECASE)
@@ -173,8 +173,8 @@ def read_series(root):
def read_user_metadata(mi, root):
- from calibre.utils.config import from_json
- from calibre.ebooks.metadata.book.json_codec import decode_is_multiple
+ from ebook_converter.utils.config import from_json
+ from ebook_converter.ebooks.metadata.book.json_codec import decode_is_multiple
fields = set()
for item in XPath('//calibre:custom_metadata')(root):
for li in XPath('./rdf:Bag/rdf:li')(item):
@@ -436,8 +436,8 @@ def create_series(calibre, series, series_index):
def create_user_metadata(calibre, all_user_metadata):
- from calibre.utils.config import to_json
- from calibre.ebooks.metadata.book.json_codec import object_to_unicode, encode_is_multiple
+ from ebook_converter.utils.config import to_json
+ from ebook_converter.ebooks.metadata.book.json_codec import object_to_unicode, encode_is_multiple
s = calibre.makeelement(expand('calibre:custom_metadata'))
calibre.append(s)
@@ -640,7 +640,7 @@ def merge_xmp_packet(old, new):
if __name__ == '__main__':
- from calibre.utils.podofo import get_xmp_metadata
+ from ebook_converter.utils.podofo import get_xmp_metadata
xmp_packet = get_xmp_metadata(sys.argv[-1])
mi = metadata_from_xmp_packet(xmp_packet)
np = metadata_to_xmp_packet(mi)
diff --git a/ebook_converter/ebooks/mobi/utils.py b/ebook_converter/ebooks/mobi/utils.py
index 7dd2943..4b08a91 100644
--- a/ebook_converter/ebooks/mobi/utils.py
+++ b/ebook_converter/ebooks/mobi/utils.py
@@ -14,7 +14,7 @@ from ebook_converter.utils.img import save_cover_data_to, scale_image, image_to_
from ebook_converter.utils.imghdr import what
from ebook_converter.ebooks import normalize
from ebook_converter.polyglot.builtins import unicode_type, range, as_bytes, map
-from tinycss.color3 import parse_color_string
+from ebook_converter.tinycss.color3 import parse_color_string
IMAGE_MAX_SIZE = 10 * 1024 * 1024
RECORD_SIZE = 0x1000 # 4096 (Text record size (uncompressed))
diff --git a/ebook_converter/ebooks/oeb/stylizer.py b/ebook_converter/ebooks/oeb/stylizer.py
index 0a2a553..16cba7c 100644
--- a/ebook_converter/ebooks/oeb/stylizer.py
+++ b/ebook_converter/ebooks/oeb/stylizer.py
@@ -500,7 +500,6 @@ class Style(object):
background shortcut properties. Note that inheritance/default values
are not used. None is returned if no background color is set.
'''
-
def validate_color(col):
return cssprofiles.validateWithProfile('color',
col,
diff --git a/ebook_converter/ebooks/pdf/pdftohtml.py b/ebook_converter/ebooks/pdf/pdftohtml.py
index 248e37c..40c95b3 100644
--- a/ebook_converter/ebooks/pdf/pdftohtml.py
+++ b/ebook_converter/ebooks/pdf/pdftohtml.py
@@ -98,7 +98,7 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
with lopen(index, 'r+b') as i:
raw = i.read().decode('utf-8', 'replace')
raw = flip_images(raw)
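- raw = raw.replace('<head', '<!-- created by calibre\'s pdftohtml -->\n <head', 1)
+ raw = raw.replace('<head', '<!-- created by ebook-converter\'s pdftohtml -->\n <head', 1)
i.seek(0)
i.truncate()
# versions of pdftohtml >= 0.20 output self closing <br> tags, this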
diff --git a/ebook_converter/tinycss/__init__.py b/ebook_converter/tinycss/__init__.py
index 0ec8afe..b994f53 100644
--- a/ebook_converter/tinycss/__init__.py
+++ b/ebook_converter/tinycss/__init__.py
@@ -12,9 +12,9 @@
from .version import VERSION
__version__ = VERSION
-from tinycss.css21 import CSS21Parser
-from tinycss.page3 import CSSPage3Parser
-from tinycss.fonts3 import CSSFonts3Parser
+from ebook_converter.tinycss.css21 import CSS21Parser
+from ebook_converter.tinycss.page3 import CSSPage3Parser
+from ebook_converter.tinycss.fonts3 import CSSFonts3Parser
from ebook_converter.tinycss.media3 import CSSMedia3Parser
diff --git a/ebook_converter/tinycss/css21.py b/ebook_converter/tinycss/css21.py
index 2f1d3f8..7581e8f 100644
--- a/ebook_converter/tinycss/css21.py
+++ b/ebook_converter/tinycss/css21.py
@@ -13,10 +13,10 @@
from __future__ import unicode_literals
from itertools import chain, islice
-from tinycss.decoding import decode
-from tinycss.token_data import TokenList
-from tinycss.tokenizer import tokenize_grouped
-from tinycss.parsing import (
+from ebook_converter.tinycss.decoding import decode
+from ebook_converter.tinycss.token_data import TokenList
+from ebook_converter.tinycss.tokenizer import tokenize_grouped
+from ebook_converter.tinycss.parsing import (
strip_whitespace, remove_whitespace, split_on_comma, validate_value,
validate_any, ParseError)
diff --git a/ebook_converter/tinycss/decoding.py b/ebook_converter/tinycss/decoding.py
index 32e1799..f8e86fe 100644
--- a/ebook_converter/tinycss/decoding.py
+++ b/ebook_converter/tinycss/decoding.py
@@ -15,7 +15,7 @@ from __future__ import unicode_literals
import operator
import re
-from polyglot.binary import from_hex_bytes
+from ebook_converter.polyglot.binary import from_hex_bytes
__all__ = ['decode'] # Everything else is implementation detail
diff --git a/ebook_converter/tinycss/fonts3.py b/ebook_converter/tinycss/fonts3.py
index d55e8dd..ee959f7 100644
--- a/ebook_converter/tinycss/fonts3.py
+++ b/ebook_converter/tinycss/fonts3.py
@@ -8,8 +8,8 @@ __copyright__ = '2014, Kovid Goyal '
import re
from ebook_converter.polyglot.builtins import map
-from tinycss.css21 import CSS21Parser, ParseError
-from tinycss.tokenizer import tokenize_grouped
+from ebook_converter.tinycss.css21 import CSS21Parser, ParseError
+from ebook_converter.tinycss.tokenizer import tokenize_grouped
def parse_font_family_tokens(tokens):
diff --git a/ebook_converter/tinycss/media3.py b/ebook_converter/tinycss/media3.py
index 9498c78..dff6499 100644
--- a/ebook_converter/tinycss/media3.py
+++ b/ebook_converter/tinycss/media3.py
@@ -5,8 +5,8 @@ from __future__ import absolute_import, division, print_function, unicode_litera
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal '
-from tinycss.css21 import CSS21Parser
-from tinycss.parsing import remove_whitespace, split_on_comma, ParseError
+from ebook_converter.tinycss.css21 import CSS21Parser
+from ebook_converter.tinycss.parsing import remove_whitespace, split_on_comma, ParseError
from ebook_converter.polyglot.builtins import error_message
diff --git a/ebook_converter/tinycss/tokenizer.py b/ebook_converter/tinycss/tokenizer.py
index 154c111..6e03c8b 100644
--- a/ebook_converter/tinycss/tokenizer.py
+++ b/ebook_converter/tinycss/tokenizer.py
@@ -14,7 +14,7 @@
from __future__ import unicode_literals
-from tinycss import token_data
+from ebook_converter.tinycss import token_data
def tokenize_flat(css_source, ignore_comments=True,