mirror of
https://github.com/gryf/ebook-converter.git
synced 2025-12-28 04:02:27 +01:00
Here is the first batch of modules, which are needed for converting several formats to LRF. Some of the logic has been changed; more cleanups will follow.
110 lines
3.3 KiB
Python
110 lines
3.3 KiB
Python
#!/usr/bin/env python2
|
||
# -*- coding: utf-8 -*-
|
||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||
|
||
"""
|
||
Original Perl version by: John Gruber https://daringfireball.net/ 10 May 2008
|
||
Python version by Stuart Colville http://muffinresearch.co.uk
|
||
Modifications to make it work with non-ascii chars by Kovid Goyal
|
||
License: http://www.opensource.org/licenses/mit-license.php
|
||
"""
|
||
|
||
import re
|
||
|
||
from ebook_converter.utils.icu import capitalize, upper
|
||
from ebook_converter.polyglot.builtins import unicode_type
|
||
|
||
__all__ = ['titlecase']
__version__ = '0.5'

# Alternation of the "small" words; spliced into the patterns below.
SMALL = 'a|an|and|as|at|but|by|en|for|if|in|of|on|or|the|to|v\\.?|via|vs\\.?'
# Punctuation characters, already escaped for use inside a character class.
PUNCT = r"""!"#$%&'‘’()*+,\-‒–—―./:;?@[\\\]_`{|}~"""

# A token that consists of nothing but one small word (any case).
SMALL_WORDS = re.compile(r'^({})$'.format(SMALL), re.IGNORECASE)
# A letter, a literal dot, then another letter, anywhere in the token.
INLINE_PERIOD = re.compile(r'[a-z][.][a-z]', re.IGNORECASE)
# Optional leading punctuation, ASCII letters, then an uppercase ASCII letter.
UC_ELSEWHERE = re.compile(r'[{}]*?[a-zA-Z]+[A-Z]+?'.format(PUNCT))
# Optional leading punctuation followed by the first word character.
CAPFIRST = re.compile(
    unicode_type(r"^[{}]*?(\w)".format(PUNCT)), flags=re.UNICODE)
# A small word at the very start of the string (after optional punctuation).
SMALL_FIRST = re.compile(
    r'^([{}]*)({})\b'.format(PUNCT, SMALL), re.IGNORECASE | re.UNICODE)
# A small word at the very end of the string (before optional punctuation).
SMALL_LAST = re.compile(
    r'\b({})[{}]?$'.format(SMALL, PUNCT), re.IGNORECASE | re.UNICODE)
# An article that follows digits and whitespace.
SMALL_AFTER_NUM = re.compile(
    r'(\d+\s+)(a|an|the)\b', re.IGNORECASE | re.UNICODE)
# A small word right after one of ":.;?!" and a single space (case-sensitive).
SUBPHRASE = re.compile(r'([:.;?!][ ])({})'.format(SMALL))
# One of d/o/l, an apostrophe, then letters -- e.g. "o'neil".
APOS_SECOND = re.compile(r"^[dol]{1}['‘]{1}[a-z]+$", re.IGNORECASE)
# Uppercase initials separated by dots, e.g. "J.R.R.".
UC_INITIALS = re.compile(r"^(?:[A-Z]{1}\.{1}|[A-Z]{1}\.{1}[A-Z]{1})+$")

# Cache slot filled lazily by lang().
_lang = None
|
||
|
||
|
||
def lang():
    """
    Return the lowercased result of ``get_lang()``.

    The value is computed on the first call and stored in the module-level
    ``_lang``; later calls return the stored value.
    """
    global _lang
    if _lang is not None:
        return _lang
    # Imported here rather than at module level, as in the original code.
    from ebook_converter.utils.localization import get_lang
    _lang = get_lang().lower()
    return _lang
|
||
|
||
|
||
def titlecase(text):
    """
    Titlecases input text

    This filter changes all words to Title Caps, and attempts to be clever
    about *un*capitalizing SMALL words like a/an/the in the input.

    The list of "SMALL words" which are not capped comes from
    the New York Times Manual of Style, plus 'vs' and 'v'.

    Processing happens in two passes:

    1. The text is split on runs of whitespace (which are kept verbatim) and
       every remaining token is handled on its own:

       * if the whole input is upper case, tokens matching ``UC_INITIALS``
         are kept as they are and all other tokens are lowercased first;
       * tokens matching ``APOS_SECOND`` (e.g. ``o'neil``) get their first
         letter and the letter after the apostrophe uppercased;
       * tokens with an inline period or with an uppercase letter after
         the first letters are left untouched;
       * small words are lowercased;
       * anything else has the first word character of each
         hyphen-separated part uppercased.

    2. On the joined result, small words are capitalized again when they
       start the string, follow a number (a/an/the only), end the string,
       or follow one of ``:.;?!`` and a single space.

    :param text: the string to convert.
    :return: the title-cased string.
    """
    # ``icu_lower``/``icu_upper`` are neither defined nor imported in this
    # module.  Use the explicitly imported ``upper`` and pull ``lower`` from
    # the same module, so the function does not depend on names having been
    # injected elsewhere.
    from ebook_converter.utils.icu import lower

    all_caps = upper(text) == text

    whitespace = re.compile(r'(\s+)')

    def cap_match(match):
        # Uppercase the whole CAPFIRST match (leading punctuation + letter).
        return upper(match.group(0))

    def cap_second_group(match):
        # Keep group 1 as it is and capitalize the small word in group 2.
        return '%s%s' % (match.group(1), capitalize(match.group(2)))

    line = []
    # The capturing group makes split() return the separators too, so the
    # original spacing survives the round trip.
    for word in whitespace.split(text):
        if not word:
            continue
        if whitespace.match(word) is not None:
            line.append(word)
            continue

        if all_caps:
            if UC_INITIALS.match(word):
                line.append(word)
                continue
            word = lower(word)

        if APOS_SECOND.match(word):
            # APOS_SECOND guarantees at least three characters:
            # letter, apostrophe, letter.
            line.append(upper(word[0]) + word[1] + upper(word[2]) + word[3:])
            continue
        if INLINE_PERIOD.search(word) or UC_ELSEWHERE.match(word):
            line.append(word)
            continue
        if SMALL_WORDS.match(word):
            line.append(lower(word))
            continue

        line.append("-".join(CAPFIRST.sub(cap_match, part)
                             for part in word.split('-')))

    result = "".join(line)

    # Second pass: re-capitalize small words in prominent positions.
    result = SMALL_FIRST.sub(cap_second_group, result)
    result = SMALL_AFTER_NUM.sub(cap_second_group, result)
    result = SMALL_LAST.sub(lambda m: capitalize(m.group(0)), result)
    result = SUBPHRASE.sub(cap_second_group, result)

    return result
|