mirror of
https://github.com/gryf/ebook-converter.git
synced 2025-12-28 20:42:27 +01:00
96 lines
2.4 KiB
Python
96 lines
2.4 KiB
Python
#!/usr/bin/python2
|
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
|
|
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
|
|
|
"""
|
|
Get word, character, and Asian character counts

1. Get a word count as a dictionary:
    wc = get_wordcount(text)
    words = wc['words'] # etc.

2. Get a word count as an object:
    wc = get_wordcount_obj(text)
    words = wc.words # etc.

Properties counted:
    * characters
    * chars_no_spaces
    * asian_chars
    * non_asian_words
    * words

Sourced from:
http://ginstrom.com/scribbles/2008/05/17/counting-words-etc-in-an-html-file-with-python/
http://ginstrom.com/scribbles/2007/10/06/counting-words-characters-and-asian-characters-with-python/
|
|
"""
|
|
# Module metadata.  Note that the version is stored as a float, not a string.
__version__ = 0.1
__author__ = 'Ryan Ginstrom'

# Code point of the ideographic (double-width) space, U+3000.  is_asian()
# uses it as a threshold: every character strictly above it counts as Asian.
IDEOGRAPHIC_SPACE = 0x3000
|
|
|
|
|
|
def is_asian(char):
    """Report whether a single character counts as Asian

    The check is a plain code-point threshold: any character whose ordinal
    lies strictly above IDEOGRAPHIC_SPACE (0x3000, the double-byte space)
    is treated as Asian.  The ideographic space itself is not.

    @param char: A single character
    """
    code_point = ord(char)
    return IDEOGRAPHIC_SPACE < code_point
|
|
|
|
|
|
def filter_jchars(c):
    """Replace an Asian character with a space; return anything else as is

    @param c: A single character
    """
    return ' ' if is_asian(c) else c
|
|
|
|
|
|
def nonj_len(word):
    """Count the non-Asian words contained in {word}

    Asian characters separate words exactly like whitespace does:
    - 日本語AアジアンB -> 2
    - hello -> 1

    @param word: A word, possibly containing Asian characters
    """
    # Blank out every Asian character, then let split() break the result
    # on runs of whitespace:
    #   本spam日eggs -> ' spam eggs' -> ['spam', 'eggs'] -> 2
    blanked = ''.join(filter_jchars(ch) for ch in word)
    pieces = blanked.split()
    return len(pieces)
|
|
|
|
|
|
def get_wordcount(text):
    """Compute the word and character statistics for text

    Each Asian character is counted as one word of its own; the remaining
    characters are grouped into words by nonj_len().  The result is a
    dictionary with the keys characters, chars_no_spaces, asian_chars,
    non_asian_words and words.

    @param text: The text of the segment
    """
    total_chars = len(text)
    visible_chars = sum(1 for ch in text if not ch.isspace())
    asian_count = sum(1 for ch in text if is_asian(ch))
    other_words = nonj_len(text)

    counts = dict(
        characters=total_chars,
        chars_no_spaces=visible_chars,
        asian_chars=asian_count,
        non_asian_words=other_words,
        # Total words: one per Asian character plus the non-Asian words
        words=other_words + asian_count,
    )
    return counts
|
|
|
|
|
|
class Obj(object):
    """Plain attribute container returned by dict2obj()

    Defined once at module level so that every object produced by
    dict2obj() shares the same type instead of getting a freshly built
    class on each call.
    """

    def __init__(self, dictionary):
        # Copy the entries rather than aliasing the mapping, so later
        # changes to the source dictionary do not show up on the object.
        self.__dict__.update(dictionary)

    def __repr__(self):
        return 'Obj(%r)' % (self.__dict__,)


def dict2obj(dictionary):
    """Transform a dictionary into an object

    Every key of the dictionary becomes an attribute of the returned
    object, holding the corresponding value.

    @param dictionary: Mapping of attribute names to values
    @return: An Obj instance carrying a copy of the entries
    """
    return Obj(dictionary)
|
|
|
|
|
|
def get_wordcount_obj(text):
    """Return the counts of get_wordcount() as attributes of an object

    @param text: The text of the segment
    """
    counts = get_wordcount(text)
    return dict2obj(counts)
|