diff --git a/ebook_converter/ebooks/oeb/transforms/unsmarten.py b/ebook_converter/ebooks/oeb/transforms/unsmarten.py
new file mode 100644
index 0000000..d2fd09c
--- /dev/null
+++ b/ebook_converter/ebooks/oeb/transforms/unsmarten.py
@@ -0,0 +1,59 @@
+"""OEB transform that replaces typographic punctuation with ASCII."""
+
+__license__ = 'GPL 3'
+__copyright__ = '2011, John Schember '
+__docformat__ = 'restructuredtext en'
+
+from ebook_converter.ebooks.oeb.base import OEB_DOCS, XPath
+from ebook_converter.ebooks.oeb.parse_utils import barename
+from ebook_converter.utils.unsmarten import unsmarten_text
+
+
+class UnsmartenPunctuation:
+    """Run ``unsmarten_text`` over the text of a book's XHTML documents.
+
+    Text inside a ``pre`` element (at any depth) is left untouched; text
+    that merely follows a ``pre`` element is converted like any other.
+    """
+
+    def __init__(self):
+        # Every XHTML element below the context node.
+        self.html_tags = XPath('descendant::h:*')
+        # Yields a non-empty result when the context node is nested in pre.
+        self.pre_ancestors = XPath('ancestor::h:pre')
+
+    def unsmarten(self, root):
+        """Convert, in place, the text held by ``root`` and its descendants.
+
+        :param root: element whose subtree is processed.  Its own ``text``
+            is converted; its ``tail`` is not, because the tail lies outside
+            the element.
+        """
+        root_in_pre = bool(self.pre_ancestors(root))
+        if root.text and not root_in_pre and barename(root.tag) != 'pre':
+            root.text = unsmarten_text(root.text)
+
+        for elem in self.html_tags(root):
+            inside_pre = bool(self.pre_ancestors(elem))
+            # ``text`` is the element's own content: skip it for pre itself
+            # as well as for anything nested in one.
+            if elem.text and not inside_pre and barename(elem.tag) != 'pre':
+                elem.text = unsmarten_text(elem.text)
+            # ``tail`` is content of the parent, so only an enclosing pre
+            # protects it -- text after a closing pre tag is still converted.
+            if elem.tail and not inside_pre:
+                elem.tail = unsmarten_text(elem.tail)
+
+    def __call__(self, oeb, context):
+        """Apply the conversion to every OEB document in the manifest.
+
+        :param oeb: book whose ``manifest.items`` are scanned; items whose
+            ``media_type`` is in ``OEB_DOCS`` have each ``body`` processed.
+        :param context: not used by this transform.
+        """
+        find_bodies = XPath('//h:body')
+        for item in oeb.manifest.items:
+            if item.media_type not in OEB_DOCS:
+                continue
+            for body in find_bodies(item.data):
+                self.unsmarten(body)
diff --git a/ebook_converter/utils/unsmarten.py b/ebook_converter/utils/unsmarten.py
new file mode 100644
index 0000000..2a3c093
--- /dev/null
+++ b/ebook_converter/utils/unsmarten.py
@@ -0,0 +1,47 @@
+"""Replacement table mapping typographic punctuation to ASCII."""
+
+__license__ = 'GPL 3'
+__copyright__ = '2011, John Schember '
+__docformat__ = 'restructuredtext en'
+
+from ebook_converter.utils.mreplace import MReplace
+
+# Every glyph is listed in three spellings -- numeric character reference,
+# named entity and the literal character -- so that text containing
+# unresolved entities is handled too.
+_mreplace = MReplace({
+    '&#8211;': '--',
+    '&ndash;': '--',
+    '–': '--',
+    '&#8212;': '---',
+    '&mdash;': '---',
+    '—': '---',
+    '&#8230;': '...',
+    '&hellip;': '...',
+    '…': '...',
+    '&#8220;': '"',
+    '&#8221;': '"',
+    '&#8222;': '"',
+    '&#8243;': '"',
+    '&ldquo;': '"',
+    '&rdquo;': '"',
+    '&bdquo;': '"',
+    '&Prime;': '"',
+    '“': '"',
+    '”': '"',
+    '„': '"',
+    '″': '"',
+    '&#8216;': "'",
+    '&#8217;': "'",
+    '&#8242;': "'",
+    '&lsquo;': "'",
+    '&rsquo;': "'",
+    '&prime;': "'",
+    '‘': "'",
+    '’': "'",
+    '′': "'",
+})
+
+# NOTE(review): presumably takes a string and returns it with every key above
+# replaced by its value -- confirm against MReplace.mreplace.
+unsmarten_text = _mreplace.mreplace