Initial import

2026-03-29 08:03:31 +02:00 · 2020-03-31 17:15:23 +02:00
commit d97ea9b0bc
311 changed files with 131419 additions and 0 deletions
--- a/ebook_converter/utils/wordcount.py
+++ b/ebook_converter/utils/wordcount.py
@@ -0,0 +1,95 @@
+#!/usr/bin/python2
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+"""
+Get word, character, and Asian character counts
+
+1. Get a word count as a dictionary:
+    wc = get_wordcount(text)
+    words = wc['words'] # etc.
+
+2. Get a word count as an object
+    wc = get_wordcount_obj(text)
+    words = wc.words # etc.
+
+properties counted:
+    * characters
+    * chars_no_spaces
+    * asian_chars
+    * non_asian_words
+    * words
+
+Sourced from:
+http://ginstrom.com/scribbles/2008/05/17/counting-words-etc-in-an-html-file-with-python/
+http://ginstrom.com/scribbles/2007/10/06/counting-words-characters-and-asian-characters-with-python/
+"""
+
+# The module docstring has to be the first statement of the file; a string
+# placed after the __future__ import is an ordinary expression and leaves
+# the module's __doc__ empty. A docstring is allowed before a future import.
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__version__ = 0.1
+__author__ = "Ryan Ginstrom"
+
+# Code point of U+3000, the ideographic (double-byte) space.
+IDEOGRAPHIC_SPACE = 0x3000
+
+
+def is_asian(char):
+    """Tell whether a single character is treated as Asian
+
+    @param char: A one-character string (it is handed to ord())
+    @return: True when the code point of char is strictly greater than
+        IDEOGRAPHIC_SPACE, False otherwise
+    """
+    code_point = ord(char)
+    # The ideographic space itself is not counted, only what lies above it.
+    return IDEOGRAPHIC_SPACE < code_point
+
+
+def filter_jchars(c):
+    """Replace an Asian character with a single space
+
+    @param c: A one-character string
+    @return: ' ' when is_asian(c) is true, otherwise c unchanged
+    """
+    return ' ' if is_asian(c) else c
+
+
+def nonj_len(word):
+    """Count the non-Asian words contained in {word}
+    - 日本語AアジアンB -> 2
+    - hello -> 1
+    @param word: A word, possibly containing Asian characters
+    @return: The number of whitespace-separated pieces left once every
+        Asian character has been replaced by a space
+    """
+    # Worked example for 本spam日eggs:
+    #   every Asian character becomes a space -> ' spam eggs'
+    #   splitting on whitespace               -> ['spam', 'eggs']
+    #   so the answer is 2
+    blanked = ''.join(filter_jchars(ch) for ch in word)
+    pieces = blanked.split()
+    return len(pieces)
+
+
+def get_wordcount(text):
+    """Compute the word and character statistics of a text
+
+    @param text: The text of the segment
+    @return: A dict with the keys characters, chars_no_spaces,
+        asian_chars, non_asian_words and words
+    """
+    total_chars = len(text)
+
+    # Tally non-whitespace characters and Asian characters in one walk.
+    visible_chars = 0
+    asian_total = 0
+    for ch in text:
+        if not ch.isspace():
+            visible_chars += 1
+        if is_asian(ch):
+            asian_total += 1
+
+    western_words = nonj_len(text)
+
+    # Each Asian character is counted as a word of its own.
+    return dict(characters=total_chars,
+                chars_no_spaces=visible_chars,
+                asian_chars=asian_total,
+                non_asian_words=western_words,
+                words=western_words + asian_total)
+
+
+def dict2obj(dictionary):
+    """Wrap a dictionary in an object whose attributes are its items
+
+    @param dictionary: Mapping of attribute names to values
+    @return: An instance of a throwaway class Obj carrying one attribute
+        per key of the dictionary
+    """
+    class Obj(object):
+
+        def __init__(self, mapping):
+            # Copy the items straight into the instance namespace.
+            vars(self).update(mapping)
+
+    wrapper = Obj(dictionary)
+    return wrapper
+
+
+def get_wordcount_obj(text):
+    """Return the counts of get_wordcount() as attributes of an object
+
+    @param text: The text of the segment
+    @return: The object built by dict2obj() from the get_wordcount() result
+    """
+    counts = get_wordcount(text)
+    return dict2obj(counts)