"""
Read content from txt file.
"""
import os
import re

from ebook_converter import prepare_string_for_xml
from ebook_converter.ebooks.metadata.opf2 import OPFCreator
from ebook_converter.ebooks.conversion.preprocess import DocAnalysis
from ebook_converter.utils.cleantext import clean_ascii_chars


# Skeleton of the generated document: first slot is the title, second the body.
HTML_TEMPLATE = ('<html><head><meta http-equiv="Content-Type" '
                 'content="text/html; charset=utf-8"/><title>%s </title>'
                 '</head><body>\n%s\n</body></html>')


def clean_txt(txt):
    '''
    Run transformations on the text to put it into consistent state.

    Accepts ``str`` or ``bytes`` (bytes are decoded as UTF-8 with
    replacement of undecodable sequences) and returns ``str``.
    '''
    if isinstance(txt, bytes):
        txt = txt.decode('utf-8', 'replace')
    # Strip whitespace from the end of the line. Also replace
    # all line breaks with \n.
    txt = '\n'.join([line.rstrip() for line in txt.splitlines()])

    # Replace whitespace at the beginning of the line with &nbsp;
    txt = re.sub('(?m)(?<=^)([ ]{2,}|\t+)(?=.)', '&nbsp;' * 4, txt)

    # Condense redundant spaces
    txt = re.sub('[ ]{2,}', ' ', txt)

    # Remove blank space from the beginning and end of the document.
    txt = re.sub(r'^\s+(?=.)', '', txt)
    txt = re.sub(r'(?<=.)\s+$', '', txt)
    # Remove excessive line breaks.
    txt = re.sub('\n{5,}', '\n\n\n\n', txt)
    # remove ASCII invalid chars : 0 to 8 and 11-14 to 24
    txt = clean_ascii_chars(txt)

    return txt


def split_txt(txt, epub_split_size_kb=0):
    '''
    Ensure there are split points for converting to EPUB. A misdetected
    paragraph type can result in the entire document being one giant
    paragraph. In this case the EPUB parser will not be able to determine
    where to split the file to accommodate the EPUB file size limitation
    and will fail.

    Returns ``str``. When ``epub_split_size_kb`` is not positive the text is
    returned unchanged (apart from being decoded if it was ``bytes``).
    '''
    # Takes care if there is no point to split
    if epub_split_size_kb > 0:
        if isinstance(txt, str):
            txt = txt.encode('utf-8')
        length_byte = len(txt)
        # Calculating the average chunk value for easy splitting as EPUB
        # (+2 as a safe margin)
        chunk_size = int(
            length_byte / (int(length_byte / (epub_split_size_kb * 1024)) + 2))
        # if there are chunks with a superior size then go and break
        parts = txt.split(b'\n\n')
        lengths = tuple(map(len, parts))
        if lengths and max(lengths) > chunk_size:
            txt = b'\n\n'.join([
                split_string_separator(line, chunk_size) for line in parts
            ])
    if isinstance(txt, bytes):
        txt = txt.decode('utf-8')

    return txt


def convert_basic(txt, title='', epub_split_size_kb=0):
    '''
    Converts plain text to html by putting all paragraphs in
    <p> tags. It condense and retains blank lines when necessary.

    Requires paragraphs to be in single line format.
    '''
    txt = clean_txt(txt)
    txt = split_txt(txt, epub_split_size_kb)

    lines = []
    blank_count = 0
    # Split into paragraphs based on having a blank line between text.
    for line in txt.split('\n'):
        if line.strip():
            blank_count = 0
            lines.append(
                u'<p>%s</p>' % prepare_string_for_xml(line.replace('\n', ' ')))
        else:
            blank_count += 1
            # Only the second consecutive blank line emits an empty
            # paragraph; longer runs are condensed into that single one.
            if blank_count == 2:
                lines.append(u'<p>&nbsp;</p>')

    return HTML_TEMPLATE % (title, u'\n'.join(lines))


DEFAULT_MD_EXTENSIONS = ('footnotes', 'tables', 'toc')


def create_markdown_object(extensions):
    '''
    Build a Markdown converter with the given extensions enabled.

    Extension names are lower-cased and any name not listed in
    ``MD_EXTENSIONS`` is dropped.
    '''
    # Need to load markdown extensions without relying on pkg_resources
    import importlib
    from ebook_converter.ebooks.markdown import Markdown
    from markdown import Extension

    class NotBrainDeadMarkdown(Markdown):

        def build_extension(self, ext_name, configs):
            # Dotted / colon-qualified names are full import paths; let the
            # stock implementation resolve those.
            if '.' in ext_name or ':' in ext_name:
                return Markdown.build_extension(self, ext_name, configs)
            ext_name = 'markdown.extensions.' + ext_name
            module = importlib.import_module(ext_name)
            if hasattr(module, 'makeExtension'):
                return module.makeExtension(**configs)
            # Fall back to the first Extension subclass defined in the module.
            for name, x in vars(module).items():
                if (type(x) is type and issubclass(x, Extension)
                        and x is not Extension):
                    return x(**configs)
            raise ImportError('No extension class in {}'.format(ext_name))

    from ebook_converter.ebooks.conversion.plugins.txt_input import \
        MD_EXTENSIONS
    extensions = [x.lower() for x in extensions]
    extensions = [x for x in extensions if x in MD_EXTENSIONS]
    md = NotBrainDeadMarkdown(extensions=extensions)
    return md


def convert_markdown(txt, title='', extensions=DEFAULT_MD_EXTENSIONS):
    '''
    Convert Markdown text to a complete HTML document string.
    '''
    md = create_markdown_object(extensions)
    return HTML_TEMPLATE % (title, md.convert(txt))


def convert_markdown_with_metadata(txt, title='',
                                   extensions=DEFAULT_MD_EXTENSIONS):
    '''
    Convert Markdown text to HTML and collect the metadata header read by
    the ``meta`` extension.

    Returns a ``(Metadata, html_document)`` tuple. The ``extensions``
    argument is never modified.
    '''
    from ebook_converter.ebooks.metadata.book.base import Metadata
    from ebook_converter.utils.date import parse_only_date
    from ebook_converter.db.write import get_series_values
    # Work on a copy: the default is an immutable tuple, and a caller's list
    # must not grow as a side effect of this call.
    extensions = list(extensions)
    if 'meta' not in extensions:
        extensions.append('meta')
    md = create_markdown_object(extensions)
    html = md.convert(txt)
    mi = Metadata(title or 'Unknown')
    m = md.Meta
    # Accept 'date' and 'summary' as aliases unless the canonical key is set.
    for k, v in {'date': 'pubdate', 'summary': 'comments'}.items():
        if v not in m and k in m:
            m[v] = m.pop(k)
    fields = 'title authors series tags pubdate comments publisher rating'
    for k in fields.split():
        val = m.get(k)
        if val:
            mf = mi.metadata_for_field(k)
            # Values are indexed here, so they are presumably lists of
            # strings; single-valued fields take the first entry.
            if not mf.get('is_multiple'):
                val = val[0]
            if k == 'series':
                val, si = get_series_values(val)
                mi.series_index = 1 if si is None else si
            if k == 'rating':
                try:
                    # Clamp into the 0..10 range.
                    val = max(0, min(int(float(val)), 10))
                except Exception:
                    # Best effort: skip a rating that cannot be parsed.
                    continue
            if mf.get('datatype') == 'datetime':
                try:
                    val = parse_only_date(val, assume_utc=False)
                except Exception:
                    # Best effort: skip a date that cannot be parsed.
                    continue
            setattr(mi, k, val)
    return mi, HTML_TEMPLATE % (mi.title, html)


def convert_textile(txt, title=''):
    '''
    Convert Textile text to a complete HTML document string.
    '''
    from ebook_converter.ebooks.textile.functions import textile
    html = textile(txt, encoding='utf-8')
    return HTML_TEMPLATE % (title, html)


def normalize_line_endings(txt):
    '''
    Convert \\r\\n and lone \\r line endings to \\n.
    '''
    txt = txt.replace('\r\n', '\n')
    txt = txt.replace('\r', '\n')
    return txt


def separate_paragraphs_single_line(txt):
    '''
    Double every line break so each line becomes its own paragraph.
    '''
    txt = txt.replace('\n', '\n\n')
    return txt


def separate_paragraphs_print_formatted(txt):
    '''
    Insert a line break before every line that starts with one or more tabs
    or with two or more spaces, keeping the indent itself.
    '''
    txt = re.sub(u'(?miu)^(?P<indent>\t+|[ ]{2,})(?=.)',
                 lambda mo: '\n%s' % mo.group('indent'), txt)
    return txt


def separate_hard_scene_breaks(txt):
    '''
    Surround lines made up only of scene-break characters with line breaks.
    '''
    def sep_break(line):
        if len(line.strip()) > 0:
            return '\n%s\n' % line
        else:
            return line
    # NOTE(review): inside the character class `\t-=` is a range (tab up to
    # '='), so digits and most ASCII punctuation match as well, not just a
    # literal '-'. Left unchanged because existing documents may rely on
    # lines such as '***' matching - confirm the intended set.
    txt = re.sub(r'(?miu)^[ \t-=~\/_]+$',
                 lambda mo: sep_break(mo.group()), txt)
    return txt


def block_to_single_line(txt):
    '''
    Join lines separated by a single line break with a space.
    '''
    txt = re.sub(r'(?<=.)\n(?=.)', ' ', txt)
    return txt


def preserve_spaces(txt):
    '''
    Replaces spaces multiple spaces with &nbsp; entities.
    '''
    # Keep one ordinary space, then one entity for each remaining space.
    txt = re.sub(
        '(?P<space>[ ]{2,})',
        lambda mo: ' ' + ('&nbsp;' * (len(mo.group('space')) - 1)), txt)
    txt = txt.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;')
    return txt


def remove_indents(txt):
    '''
    Remove whitespace at the beginning of each line.
    '''
    return '\n'.join([line.lstrip() for line in txt.splitlines()])


def opf_writer(path, opf_name, manifest, spine, mi):
    '''
    Create an OPF from the manifest, spine and metadata and write it to
    ``opf_name`` inside ``path``.
    '''
    opf = OPFCreator(path, mi)
    opf.create_manifest(manifest)
    opf.create_spine(spine)
    with open(os.path.join(path, opf_name), 'wb') as opffile:
        opf.render(opffile)


def split_string_separator(txt, size):
    r'''
    Splits the text by putting \n\n at the point size.

    ``txt`` must be ``bytes``. The text is cut into pieces of at most
    ``size - 2`` bytes, joined with ``b'\n\n'``. Each cut is placed right
    after the last ``b'.'`` in the current window when there is one;
    otherwise it is moved back so that it does not fall inside a multi-byte
    UTF-8 sequence. No byte of the input is dropped or duplicated.

    The input is returned unchanged when it already fits in ``size`` or
    when ``size`` is too small (<= 2) to leave room for the separator.
    '''
    if len(txt) <= size or size <= 2:
        return txt

    size -= 2  # leave room for the two separator bytes
    total = len(txt)
    chunks = []
    start = 0
    while start < total:
        end = min(start + size, total)
        if end < total:
            dot = txt.rfind(b'.', start, end)
            if dot != -1:
                # Prefer to break at the end of a sentence.
                end = dot + 1
            else:
                # Bytes of the form 0b10xxxxxx continue a UTF-8 character;
                # step back until the cut sits on a character boundary.
                cut = end
                while cut > start and (txt[cut] & 0xC0) == 0x80:
                    cut -= 1
                # If the window is smaller than one character keep the
                # forced cut so that the loop always makes progress.
                if cut > start:
                    end = cut
        chunks.append(txt[start:end])
        start = end
    return b'\n\n'.join(chunks)


def detect_paragraph_type(txt):
    '''
    Tries to determine the paragraph type of the document.

    block: Paragraphs are separated by a blank line.
    single: Each line is a paragraph.
    print: Each paragraph starts with a 2+ spaces or a tab
           and ends when a new paragraph is reached.
    unformatted: most lines have hard line breaks, few/no blank lines or
                 indents

    returns block, single, print, unformatted
    '''
    txt = txt.replace('\r\n', '\n')
    txt = txt.replace('\r', '\n')
    txt_line_count = len(re.findall(r'(?mu)^\s*.+$', txt))

    # Check for hard line breaks - true if 55% of the doc breaks in the same
    # region
    docanalysis = DocAnalysis('txt', txt)
    hardbreaks = docanalysis.line_histogram(.55)

    if hardbreaks:
        if txt_line_count:
            # Determine print percentage
            tab_line_count = len(re.findall(r'(?mu)^(\t|\s{2,}).+$', txt))
            print_percent = tab_line_count / float(txt_line_count)

            # Determine block percentage
            empty_line_count = len(re.findall(r'(?mu)^\s*$', txt))
            block_percent = empty_line_count / float(txt_line_count)
        else:
            # No non-empty lines: avoid dividing by zero and fall through to
            # the 'unformatted' result below.
            print_percent = block_percent = 0.0

        # Compare the two types - the type with the larger number of
        # instances wins in cases where only one or the other represents the
        # vast majority of the document neither wins
        if print_percent >= block_percent:
            if .15 <= print_percent <= .75:
                return 'print'
        elif .15 <= block_percent <= .75:
            return 'block'

        # Assume unformatted text with hardbreaks if nothing else matches
        return 'unformatted'

    # return single if hardbreaks is false
    return 'single'


def detect_formatting_type(txt):
    '''
    Tries to determine the formatting of the document.

    markdown: Markdown formatting is used.
    textile: Textile formatting is used.
    heuristic: When none of the above formatting types are
               detected heuristic is returned.
    '''
    # Keep a count of the number of format specific object
    # that are found in the text.
    markdown_count = 0
    textile_count = 0

    # Check for markdown
    # Headings
    markdown_count += len(re.findall('(?mu)^#+', txt))
    markdown_count += len(re.findall('(?mu)^=+$', txt))
    markdown_count += len(re.findall('(?mu)^-+$', txt))
    # Images
    markdown_count += len(re.findall(r'(?u)!\[.*?\](\[|\()', txt))
    # Links: a '[' that is not preceded by '!' (which would make it an
    # image). The alternation is grouped so that a link at the start of a
    # line counts and the bare '^' cannot match on its own.
    markdown_count += len(
        re.findall(r'(?mu)(?:^|[^!])\[.*?\](\[|\()', txt))

    # Check for textile
    # Headings
    textile_count += len(re.findall(r'(?mu)^h[1-6]\.', txt))
    # Block quote.
    textile_count += len(re.findall(r'(?mu)^bq\.', txt))
    # Images
    textile_count += len(re.findall(r'(?mu)(?<=\!)\S+(?=\!)', txt))
    # Links
    textile_count += len(re.findall(r'"[^"]*":\S+', txt))
    # paragraph blocks
    textile_count += len(re.findall(r'(?mu)^p(<|<>|=|>)?\. ', txt))

    # Decide if either markdown or textile is used in the text
    # based on the number of unique formatting elements found.
    if markdown_count > 5 or textile_count > 5:
        if markdown_count > textile_count:
            return 'markdown'
        else:
            return 'textile'

    return 'heuristic'