font|span|[ibu])[^>]*>)?\s*")
self.line_close = "((?P=inner3)>)?\\s*((?P=inner2)>)?\\s*((?P=inner1)>)?\\s*(?P=outer)>"
self.single_blank = re.compile(r'(\s*<(p|div)[^>]*>\s*(p|div)>)', re.IGNORECASE)
self.scene_break_open = ''
self.common_in_text_endings = '[\"\'—’”,\\.!\\?\\…\\)„\\w]'
self.common_in_text_beginnings = '[\\w\'\"“‘‛]'
def is_pdftohtml(self, src):
    """Return True if *src* looks like HTML produced by calibre's pdftohtml.

    Only the first 1000 characters are scanned, since the generator
    comment appears at the top of the document.
    """
    # NOTE(review): the marker literal was empty in the original
    # ('' in src[:1000] is always True); restored to the pdftohtml
    # generator comment.
    return "<!-- created by calibre's pdftohtml -->" in src[:1000]
def is_abbyy(self, src):
# NOTE(review): this span looks truncated/corrupted — the body of
# is_abbyy and the `def` line of the following method (apparently a
# chapter_head-style callback: it uses `chap`, `title`, html2text and
# self.html_preprocess_sections) appear to be missing, and the HTML tag
# text inside the string literals (and the (?P<...> group names) seems
# to have been stripped. Kept byte-identical pending recovery of the
# original source — confirm against upstream.
return ''+chap+'\n'
else:
delete_whitespace = re.compile('^\\s*(?P.*?)\\s*$')
delete_quotes = re.compile('\'\"')
txt_chap = delete_quotes.sub('', delete_whitespace.sub('\\g', html2text(chap)))
txt_title = delete_quotes.sub('', delete_whitespace.sub('\\g', html2text(title)))
self.html_preprocess_sections = self.html_preprocess_sections + 1
self.log.debug("marked %s chapters & titles. - %s, %s",
self.html_preprocess_sections, chap, title)
return ''+chap+'
\n'+title+'
\n'
def chapter_break(self, match):
    """Re-emit a matched section marker with a forced page break.

    Bumps the running section counter, logs the hit, and rebuilds the
    matched opening tag with a page-break-before style applied.
    """
    section_text = match.group('section')
    tag_name = match.group('styles')
    self.html_preprocess_sections += 1
    self.log.debug("marked %s section markers based on punctuation. - %s",
                   self.html_preprocess_sections, section_text)
    return '<{} style="page-break-before:always">{}'.format(tag_name, section_text)
def analyze_title_matches(self, match):
    """Tally whether a chapter match captured a title group.

    Used as a re.sub callback during pattern analysis; increments
    either chapters_with_title or chapters_no_title on self.
    """
    if match.group('title'):
        self.chapters_with_title += 1
    else:
        self.chapters_no_title += 1
def insert_indent(self, match):
    """Rebuild a matched opening tag with a 3% text-indent applied.

    Handles tags that already carry a style attribute (extends it),
    tags with other attributes (adds a style attribute), and bare tags.
    A trailing captured span, if any, is re-appended unchanged.
    """
    attrs = match.group('formatting')
    tag = match.group('tagtype')
    trailing_span = match.group('span')
    self.found_indents += 1
    if attrs:
        if 'style' in attrs.lower():
            # extend the existing style attribute; assumes it ends with a quote
            attrs = re.sub(r'"$', '; text-indent:3%"', attrs)
        else:
            attrs = attrs + ' style="text-indent:3%"'
        opening = '<' + tag + ' ' + attrs + '>'
    else:
        opening = '<' + tag + ' style="text-indent:3%">'
    return opening + trailing_span if trailing_span else opening
def no_markup(self, raw, percent):
    '''
    Detect whether the document's line endings are largely unmarked.

    raw is the text to inspect. percent (clamped to the range 0..1) is
    the minimum fraction of line endings that should be marked up.
    Returns True when fewer closing </p>/</div> tags exist than
    percent * (number of line endings), i.e. the text is mostly plain.
    '''
    # NOTE(review): the closing-tag pattern had lost its '</' prefix
    # ('(p|div)>' also matches inside opening tags such as <p>); restored.
    htm_end_ere = re.compile('</(p|div)>', re.DOTALL)
    line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
    tot_htm_ends = len(htm_end_ere.findall(raw))
    tot_ln_fds = len(line_end_ere.findall(raw))
    # clamp percent into [0, 1]
    percent = min(max(percent, 0), 1)
    min_lns = tot_ln_fds * percent
    return min_lns > tot_htm_ends
def dump(self, raw, where):
    """Write *raw* (a str) to a numbered debug file.

    Files go under <debug_pipeline>/preprocess/<where>/NNNN.html, using
    the first unused 4-digit name. Does nothing unless the conversion
    was run with a debug_pipeline directory that already exists.
    """
    import os
    dp = getattr(self.extra_opts, 'debug_pipeline', None)
    if not (dp and os.path.exists(dp)):
        return
    odir = os.path.join(dp, 'preprocess', where)
    # replaces the original's repeated exists()/makedirs() checks
    os.makedirs(odir, exist_ok=True)
    i = 0
    name = None
    while not name or os.path.exists(os.path.join(odir, name)):
        i += 1
        name = '%04d.html' % i
    with open(os.path.join(odir, name), 'wb') as f:
        f.write(raw.encode('utf-8'))
def get_word_count(self, html):
    """Return the number of words in *html*, ignoring markup.

    The <head> section is removed first, then all remaining tags are
    stripped before counting via get_wordcount_obj.
    """
    # NOTE(review): the head-stripping pattern had lost its tag text
    # (r'(?s)]*>.*?'); restored to drop the whole <head>...</head> block.
    word_count_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
    word_count_text = re.sub(r'<[^>]*>', '', word_count_text)
    wordcount = get_wordcount_obj(word_count_text)
    return wordcount.words
def markup_italicis(self, html):
    """Wrap common Latin abbreviations and ad-hoc emphasis markup in <i> tags.

    Two passes: first, known italicized words/abbreviations found between
    whitespace or tags; second, textual emphasis conventions such as
    *word*, /word/, _word_ and combinations, matched against a
    tag-stripped copy of the document so existing markup cannot confuse
    the patterns.
    """
    ITALICIZE_WORDS = [
        'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
        'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
        'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
        'Mlle.', 'Mons.', 'PS.', 'PPS.',
    ]
    # NOTE(review): the named group (?P<words>...) and the <i>...</i>
    # replacement text had been stripped from these literals (L121's
    # match.group('words') requires the group name); restored.
    ITALICIZE_STYLE_PATS = [
        r'(?msu)(?<=[\s>"“\'‘])_\*/(?P<words>[^\*_]+)/\*_',
        r'(?msu)(?<=[\s>"“\'‘])~~(?P<words>[^~]+)~~',
        r'(?msu)(?<=[\s>"“\'‘])_/(?P<words>[^/_]+)/_',
        r'(?msu)(?<=[\s>"“\'‘])_\*(?P<words>[^\*_]+)\*_',
        r'(?msu)(?<=[\s>"“\'‘])\*/(?P<words>[^/\*]+)/\*',
        r'(?msu)(?<=[\s>"“\'‘])/:(?P<words>[^:/]+):/',
        r'(?msu)(?<=[\s>"“\'‘])\|:(?P<words>[^:\|]+):\|',
        r'(?msu)(?<=[\s>"“\'‘])\*(?P<words>[^\*]+)\*',
        r'(?msu)(?<=[\s>"“\'‘])~(?P<words>[^~]+)~',
        r'(?msu)(?<=[\s>"“\'‘])/(?P<words>[^/\*><]+)/',
        r'(?msu)(?<=[\s>"“\'‘])_(?P<words>[^_]+)_',
    ]
    for word in ITALICIZE_WORDS:
        html = re.sub(r'(?<=\s|>)' + re.escape(word) + r'(?=\s|<)',
                      '<i>%s</i>' % word, html)
    # search a tag-free copy so patterns never match inside markup
    search_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
    search_text = re.sub(r'<[^>]*>', '', search_text)
    for pat in ITALICIZE_STYLE_PATS:
        for match in re.finditer(pat, search_text):
            ital_string = str(match.group('words'))
            try:
                # re.escape: the matched text is reused as a literal pattern
                html = re.sub(re.escape(str(match.group(0))),
                              '<i>%s</i>' % ital_string, html)
            except OverflowError:
                # match.group(0) was too large to be compiled into a regex
                continue
            except re.error:
                # the match was not a valid regular expression
                continue
    return html
def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
'''
Searches for common chapter headings throughout the document
attempts multiple patterns based on likelihood of a match
with minimum false positives. Exits after finding a successful pattern
'''
# NOTE(review): many regex literals below look corrupted — named groups
# (e.g. what should read (?P<outer2>...) in title_line_open appears as
# bare (?P), tag text has been stripped (e.g. the `heading` pattern and
# the 'emphasized' chapter type), and the blank_lines literal is split
# across two lines. Code is kept byte-identical; confirm every literal
# against the original source before relying on this function.
# Typical chapters are between 2000 and 7000 words, use the larger number to decide the
# minimum of chapters to search for. A max limit is calculated to prevent things like OCR
# or pdf page numbers from being treated as TOC markers
max_chapters = 150
typical_chapters = 7000.
if wordcount > 7000:
if wordcount > 200000:
typical_chapters = 15000.
self.min_chapters = int(ceil(wordcount / typical_chapters))
self.log.debug("minimum chapters required are: %s", self.min_chapters)
# presumably counts pre-existing heading tags (<h1>-<h3>?); the pattern
# looks truncated — TODO confirm against the original
heading = re.compile(']*>', re.IGNORECASE)
self.html_preprocess_sections = len(heading.findall(html))
self.log.debug("found %s pre-existing headings",
self.html_preprocess_sections)
# Build the Regular Expressions in pieces
init_lookahead = "(?=<(p|div))"
chapter_line_open = self.line_open
title_line_open = (r"<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?"
r"\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*")
chapter_header_open = r"(?P"
title_header_open = r"(?P"
chapter_header_close = ")\\s*"
title_header_close = ")"
chapter_line_close = self.line_close
title_line_close = "((?P=inner6)>)?\\s*((?P=inner5)>)?\\s*((?P=inner4)>)?\\s*(?P=outer2)>"
# pdftohtml output gets a simpler single-level title wrapper
is_pdftohtml = self.is_pdftohtml(html)
if is_pdftohtml:
title_line_open = "<(?Pp)[^>]*>\\s*"
title_line_close = "\\s*(?P=outer2)>"
if blanks_between_paragraphs:
blank_lines = "(\\s*]*>\\s*
){0,2}\\s*"
else:
blank_lines = ""
opt_title_open = "("
opt_title_close = ")?"
n_lookahead_open = "(?!\\s*"
n_lookahead_close = ")\\s*"
default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'’\"-]+\s{0,3}){1,5}?([ibu][^>]*>)?(?=<)"
simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!(Chapter|\s+<)).{0,65}?([ibu][^>]*>)?(?=<)"
# chapter_types rows: [regex, n_lookahead_req, strict_title, ignorecase,
# title_req, log_message, type_name]
analysis_result = []
chapter_types = [
[(
r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)"
r"\s*([\d\w-]+\:?\'?\s*){0,5}"), True, True, True, False, "Searching for common section headings", 'common'],
# Highest frequency headings which include titles
[r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'],
[r"]*>\s*(]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)()?\s*",
True, True, True, False, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
[r"[^'\"]?(\d+(\.|:))\s*([\w\-\'\"#,]+\s*){0,7}\s*", True, True, True, False,
"Searching for numeric chapter headings", 'numeric'], # Numeric Chapters
[r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, True, True, False, "Searching for letter spaced headings", 'letter_spaced'], # Spaced Lettering
[r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, True, True, False,
"Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles
[r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False,
"Searching for simple numeric headings", 'plain_number'], # Numeric Chapters, no dot or colon
[r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, True, False, False,
"Searching for chapters with Uppercase Characters", 'uppercase'] # Uppercase Chapters
]
# Try each chapter pattern in order; analyze=True only counts matches and
# records which patterns fit, analyze=False actually marks up the html.
def recurse_patterns(html, analyze):
# Start with most typical chapter headings, get more aggressive until one works
for [chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name] in chapter_types:
n_lookahead = ''
hits = 0
self.chapters_no_title = 0
self.chapters_with_title = 0
if n_lookahead_req:
lp_n_lookahead_open = n_lookahead_open
lp_n_lookahead_close = n_lookahead_close
else:
lp_n_lookahead_open = ''
lp_n_lookahead_close = ''
if strict_title:
lp_title = default_title
else:
lp_title = simple_title
if ignorecase:
arg_ignorecase = r'(?i)'
else:
arg_ignorecase = ''
if title_req:
lp_opt_title_open = ''
lp_opt_title_close = ''
else:
lp_opt_title_open = opt_title_open
lp_opt_title_close = opt_title_close
# stop once enough sections have been marked
if self.html_preprocess_sections >= self.min_chapters:
break
full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
if n_lookahead_req:
# rename the capture groups so the negative lookahead copy of the
# pattern does not collide with the main pattern's group names
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
if not analyze:
self.log.debug("Marked %s headings, %s",
self.html_preprocess_sections, log_message)
chapter_marker = arg_ignorecase+init_lookahead+full_chapter_line+blank_lines+lp_n_lookahead_open+n_lookahead+lp_n_lookahead_close+ \
lp_opt_title_open+title_line_open+title_header_open+lp_title+title_header_close+title_line_close+lp_opt_title_close
chapdetect = re.compile(r'%s' % chapter_marker)
if analyze:
hits = len(chapdetect.findall(html))
if hits:
chapdetect.sub(self.analyze_title_matches, html)
if float(self.chapters_with_title) / float(hits) > .5:
title_req = True
strict_title = False
self.log.debug('%s had %s hits %s chapters with no '
'title, %s chapters with titles, %s '
'percent.', type_name, hits,
self.chapters_no_title,
self.chapters_with_title,
self.chapters_with_title / hits)
if type_name == 'common':
analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
elif self.min_chapters <= hits < max_chapters or self.min_chapters < 3 > hits:
analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
break
else:
html = chapdetect.sub(self.chapter_head, html)
return html
# first pass analyzes which patterns fit, second pass marks the text
recurse_patterns(html, True)
chapter_types = analysis_result
html = recurse_patterns(html, False)
words_per_chptr = wordcount
if words_per_chptr > 0 and self.html_preprocess_sections > 0:
words_per_chptr = wordcount // self.html_preprocess_sections
self.log.debug("Total wordcount is: %s, Average words per section "
"is: %s, Marked up %s chapters", wordcount,
words_per_chptr, self.html_preprocess_sections)
return html
def punctuation_unwrap(self, length, content, format):
'''
Unwraps lines based on line length and punctuation
supports a range of html markup and text files
the lookahead regex below is meant look for any non-full stop characters - punctuation
characters which can be used as a full stop should *not* be added below - e.g. ?!“”. etc
the reason for this is to prevent false positive wrapping. False positives are more
difficult to detect than false negatives during a manual review of the doc
This function intentionally leaves hyphenated content alone as that is handled by the
dehyphenate routine in a separate step
'''
# NOTE(review): this function references `lookahead`, `em_en_lookahead`,
# `soft_hyphen` and `line_ending`, none of which are defined in the
# visible text — their definitions (and parts of the regex-piece
# literals near the comment below) appear to have been lost, and group
# names such as (?P<style_close>...) seem stripped from the remaining
# literals. Kept byte-identical pending recovery of the original.
# Re-joins a broken line, carrying any style tags across the join.
def style_unwrap(match):
style_close = match.group('style_close')
style_open = match.group('style_open')
if style_open and style_close:
return style_close+' '+style_open
elif style_open and not style_close:
return ' '+style_open
elif not style_open and style_close:
return style_close+' '
else:
return ' '
# define the pieces of the regex
# (?(span|[iub])>)?\\s*((p|div)>)?"
blanklines = "\\s*(?P<(p|span|div)[^>]*>\\s*(<(p|span|div)[^>]*>\\s*(span|p|div)>\\s*)(span|p|div)>\\s*){0,3}\\s*"
line_opening = "<(p|div)[^>]*>\\s*(?P<(span|[iub])[^>]*>)?\\s*"
txt_line_wrap = "((\u0020|\u0009)*\n){1,4}"
# txt input unwraps on bare newlines; html input also crosses tag boundaries
if format == 'txt':
unwrap_regex = lookahead+txt_line_wrap
em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
shy_unwrap_regex = soft_hyphen+txt_line_wrap
else:
unwrap_regex = lookahead+line_ending+blanklines+line_opening
em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening
shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening
unwrap = re.compile("%s" % unwrap_regex, re.UNICODE)
em_en_unwrap = re.compile("%s" % em_en_unwrap_regex, re.UNICODE)
shy_unwrap = re.compile("%s" % shy_unwrap_regex, re.UNICODE)
if format == 'txt':
content = unwrap.sub(' ', content)
content = em_en_unwrap.sub('', content)
content = shy_unwrap.sub('', content)
else:
content = unwrap.sub(style_unwrap, content)
content = em_en_unwrap.sub(style_unwrap, content)
content = shy_unwrap.sub(style_unwrap, content)
return content
def txt_process(self, match):
    """Convert a matched block of plain text into basic HTML paragraphs."""
    from ebook_converter.ebooks.txt.processor import convert_basic, separate_paragraphs_single_line
    raw_text = separate_paragraphs_single_line(match.group('text'))
    return convert_basic(raw_text, epub_split_size_kb=0)
def markup_pre(self, html):
# Presumably: if the document contains <pre> blocks, run their text
# through txt_process and convert entities; otherwise naively add
# paragraph markup on newlines. NOTE(review): the tag literals here look
# stripped/corrupted — `pre` compiles an empty pattern, and the
# `outerhtml`/`add_markup` patterns and replacement strings are split
# across lines. Kept byte-identical pending recovery of the originals.
pre = re.compile(r'', re.IGNORECASE)
if len(pre.findall(html)) >= 1:
self.log.debug("Running Text Processing")
outerhtml = re.compile(r'.*?(?<=)(?P.*?)
', re.IGNORECASE|re.DOTALL)
html = outerhtml.sub(self.txt_process, html)
from ebook_converter.ebooks.conversion.preprocess import convert_entities
html = re.sub(r'&(\S+?);', convert_entities, html)
else:
# Add markup naively
# TODO - find out if there are cases where there are more than one tag or
# other types of unmarked html and handle them in some better fashion
add_markup = re.compile('(?)(\n)')
html = add_markup.sub('
\n', html)
return html
def arrange_htm_line_endings(self, html):
html = re.sub(r"\s*(?Pp|div)>", ""+"\\g"+">\n", html)
html = re.sub(r"\s*<(?Pp|div)(?P