mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-02 16:54:12 +01:00
287 lines
10 KiB
Python
287 lines
10 KiB
Python
from struct import unpack_from, calcsize, pack
|
|
from collections import OrderedDict
|
|
|
|
from ebook_converter.utils.fonts.utils import read_bmp_prefix
|
|
from ebook_converter.utils.fonts.sfnt import UnknownTable, max_power_of_two
|
|
from ebook_converter.utils.fonts.sfnt.errors import UnsupportedFont
|
|
|
|
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
|
|
__docformat__ = 'restructuredtext en'
|
|
|
|
# Note that the code for creating a BMP table (cmap format 4) is taken with
|
|
# thanks from the fonttools project (BSD licensed).
|
|
|
|
|
|
def split_range(start_code, end_code, cmap):  # {{{
    '''
    Split a contiguous run of character codes into cmap format 4 segments.

    Tries to cut ``start_code..end_code`` (inclusive) into sub-ranges whose
    glyph ids are consecutive, so that the subtable is stored "most"
    efficiently.

    :param start_code: first character code of the run
    :param end_code: last character code of the run (inclusive)
    :param cmap: mapping of character code to glyph id; it is indexed with
                 every code in the run
    :return: ``(start, end)`` lists describing the segments. The start code
             of the first segment (always ``start_code``) is left out, so
             ``end`` has exactly one more entry than ``start``.
    '''
    if start_code == end_code:
        return [], [end_code]

    # Collect the runs of codes whose glyph ids increase by exactly one.
    runs = []
    run_begin = None  # first code of the run currently open, if any
    prev_code = start_code
    prev_gid = cmap[start_code]
    for code in range(start_code + 1, end_code + 1):
        gid = cmap[code]
        if gid == prev_gid + 1:
            if run_begin is None:
                run_begin = prev_code
        elif run_begin is not None:
            runs.append((run_begin, prev_code))
            run_begin = None
        prev_gid, prev_code = gid, code

    if run_begin is not None:
        runs.append((run_begin, prev_code))
    assert prev_code == end_code

    # Drop the runs that would only make the data bigger. A new segment
    # costs 8 bytes, not using a new segment costs 2 bytes per character.
    worthwhile = []
    for first, last in runs:
        touches_start = first == start_code
        touches_end = last == end_code
        if touches_start and touches_end:
            break  # the whole range, we're fine
        # A run at either edge costs one more segment, an inner run two.
        threshold = 4 if (touches_start or touches_end) else 8
        if last - first + 1 > threshold:
            worthwhile.append((first, last))

    if not worthwhile:
        return [], [end_code]

    # Lay the kept runs out over the whole range, plugging every gap (the
    # stretches in which the glyph ids are _not_ consecutive) with a segment
    # of its own.
    segments = []
    cursor = start_code  # first code not yet covered by a segment
    for first, last in worthwhile:
        if first != cursor:
            segments.append((cursor, first - 1))
        segments.append((first, last))
        cursor = last + 1
    if cursor != end_code + 1:
        segments.append((cursor, end_code))

    # Transform the ranges into start_code/end_code lists; the very first
    # start code is not reported.
    start = [first for first, _ in segments[1:]]
    end = [last for _, last in segments]

    assert len(start) + 1 == len(end)
    return start, end
# }}}
|
|
|
|
|
|
def set_id_delta(id_delta):  # {{{
    '''
    Wrap ``id_delta`` into the range of a signed 16-bit integer.

    :param id_delta: difference between a glyph id and a character code
    :return: a value equal to ``id_delta`` modulo 0x10000 that lies in
             ``-0x8000..0x7FFF`` for any input in ``-0x18000..0x17FFF``
    '''
    # The lowest gid in glyphIndexArray, after subtracting id_delta, must be
    # 1.
    # id_delta is a short, and must be between -32K and 32K-1.
    # startCode can be between 0 and 64K-1, and the first glyph index can be
    # between 1 and 64K-1.
    # This means that we have a problem, because we may need to assign to
    # id_delta values between -(64K-2) and 64K-1.
    # Since the final gid is reconstructed from the glyphArray GID by:
    #    (short)finalGID = (gid + id_delta) % 0x10000,
    # we can get from a startCode of 0 to a final GID of 64K-1 by
    # subtracting 1, and casting the negative number to an unsigned short.
    # Similarly, we can get from a startCode of 64K-1 to a final GID of 1 by
    # adding 2, because of the modulo arithmetic.

    if id_delta > 0x7FFF:
        id_delta -= 0x10000
    elif id_delta < -0x8000:
        # -0x8000 itself is a valid short and must be left alone: wrapping
        # it would give +0x8000, which cannot be packed as a signed short.
        id_delta += 0x10000

    return id_delta
# }}}
|
|
|
|
|
|
class BMPTable(object):
    '''
    Lookup helper for a cmap BMP (format 4) subtable.

    The per-segment arrays are obtained from ``read_bmp_prefix(raw, 0)``;
    the original bytes are kept in ``self.raw``.
    '''

    def __init__(self, raw):
        self.raw = raw
        prefix = read_bmp_prefix(raw, 0)
        (self.start_count, self.end_count, self.range_offset, self.id_delta,
         self.glyph_id_len, self.glyph_id_map, self.array_len) = prefix

    def _glyph_id_in_segment(self, seg, code):
        '''
        Return the glyph id (modulo 0x10000) that segment number ``seg``
        assigns to ``code``. ``code`` is expected to lie in that segment.
        '''
        offset = self.range_offset[seg]
        if offset == 0:
            # No glyph array for this segment: the glyph id is code + delta
            glyph_id = code + self.id_delta[seg]
        else:
            # The range offset is in bytes and relative to the segment's own
            # slot in the range offset array; convert it to an index into
            # glyph_id_map.
            idx = (offset // 2 + (code - self.start_count[seg]) + seg -
                   self.array_len)
            glyph_id = self.glyph_id_map[idx]
            if glyph_id:
                # A stored 0 means "no glyph" and is not shifted by the delta
                glyph_id += self.id_delta[seg]
        return glyph_id % 0x10000

    def get_glyph_ids(self, codes):
        '''
        Yield one glyph id per entry of ``codes``, in order. 0 is yielded
        for a code that no segment covers.
        '''
        for code in codes:
            for seg, seg_end in enumerate(self.end_count):
                if seg_end < code:
                    continue
                if self.start_count[seg] <= code:
                    yield self._glyph_id_in_segment(seg, code)
                    break
            else:
                yield 0

    def get_glyph_map(self, glyph_ids):
        '''
        Return a dict of character code to glyph id, containing every code
        in the table whose glyph id is in ``glyph_ids``. When a code occurs
        in more than one segment, the first segment wins.
        '''
        found = {}
        for seg, seg_end in enumerate(self.end_count):
            seg_start = self.start_count[seg]
            for code in range(seg_start, seg_end + 1):
                glyph_id = self._glyph_id_in_segment(seg, code)
                if code not in found and glyph_id in glyph_ids:
                    found[code] = glyph_id
        return found
|
|
|
|
|
|
class CmapTable(UnknownTable):
    '''
    The ``cmap`` table of a font, reduced to its Windows BMP subtable
    (platform 3, encoding 1, format 4).

    ``self.bmp_table`` is a :class:`BMPTable` for that subtable, or ``None``
    when the font has none.
    '''

    def __init__(self, *args, **kwargs):
        super(CmapTable, self).__init__(*args, **kwargs)

        self.version, self.num_tables = unpack_from(b'>HH', self.raw)

        self.tables = {}

        # Read the encoding records: (platform, encoding, subtable offset)
        offset = 4
        sz = calcsize(b'>HHL')
        recs = []
        for _ in range(self.num_tables):
            platform, encoding, table_offset = unpack_from(b'>HHL', self.raw,
                                                           offset)
            offset += sz
            recs.append((platform, encoding, table_offset))

        self.bmp_table = None

        # A subtable is taken to extend up to the start of the nearest
        # subtable stored after it, or to the end of the data. Using the
        # nearest larger offset, rather than the offset of the following
        # record, keeps this working when records share a subtable or when
        # subtables are not stored in record order.
        table_offsets = sorted({rec[-1] for rec in recs})
        for platform, encoding, offset in recs:
            next_offset = min((o for o in table_offsets if o > offset),
                              default=len(self.raw))
            table = self.raw[offset:next_offset]
            if table:
                fmt = unpack_from(b'>H', table)[0]
                if platform == 3 and encoding == 1 and fmt == 4:
                    self.bmp_table = BMPTable(table)

    def get_character_map(self, chars):
        '''
        Get a mapping of character codes to glyph ids in the font.

        :param chars: iterable of character codes
        :return: an ``OrderedDict``, sorted by character code, holding only
                 the codes that map to a non-zero glyph id
        :raises UnsupportedFont: if the font has no Windows BMP subtable
        '''
        if self.bmp_table is None:
            raise UnsupportedFont('This font has no Windows BMP cmap subtable.'
                                  ' Most likely a special purpose font.')
        chars = sorted(set(chars))
        ans = OrderedDict()
        for char, glyph_id in zip(chars, self.bmp_table.get_glyph_ids(chars)):
            if glyph_id > 0:
                ans[char] = glyph_id
        return ans

    def get_glyph_map(self, glyph_ids):
        '''
        Get a mapping of character codes to glyph ids for the specified glyph
        ids.

        :raises UnsupportedFont: if the font has no Windows BMP subtable
        '''
        if self.bmp_table is None:
            raise UnsupportedFont('This font has no Windows BMP cmap subtable.'
                                  ' Most likely a special purpose font.')
        glyph_ids = frozenset(glyph_ids)
        return self.bmp_table.get_glyph_map(glyph_ids)

    def set_character_map(self, cmap):
        '''
        Rebuild this table so that it contains a single Windows BMP
        (format 4) subtable describing ``cmap``.

        ``self.raw``, ``self.version``, ``self.num_tables`` and
        ``self.bmp_table`` are all replaced.

        :param cmap: mapping of character code to glyph id
        '''
        self.version, self.num_tables = 0, 1
        fmt = b'>7H'
        codes = sorted(cmap)

        if not codes:
            start_code = [0xffff]
            end_code = [0xffff]
        else:
            last_code = codes[0]
            end_code = []
            start_code = [last_code]

            for code in codes[1:]:
                if code == last_code + 1:
                    last_code = code
                    continue
                # A gap in the character codes closes the current run
                start, end = split_range(start_code[-1], last_code, cmap)
                start_code.extend(start)
                end_code.extend(end)
                start_code.append(code)
                last_code = code
            end_code.append(last_code)
            start_code.append(0xffff)
            end_code.append(0xffff)

        id_delta = []
        id_range_offset = []
        glyph_index_array = []
        # skip the closing codes (0xffff)
        segments = zip(start_code[:-1], end_code[:-1])
        for i, (first, last) in enumerate(segments):
            indices = [cmap[char_code] for char_code in range(first, last + 1)]
            if indices == list(range(indices[0], indices[0] + len(indices))):
                # indices is a contiguous list
                id_delta.append(set_id_delta(indices[0] - first))
                id_range_offset.append(0)
            else:
                id_delta.append(0)
                # Offset in bytes from this segment's slot in the range
                # offset array to its first entry in the glyph index array
                id_range_offset.append(
                    2 * (len(end_code) + len(glyph_index_array) - i))
                glyph_index_array.extend(indices)
        id_delta.append(1)  # 0xffff + 1 == 0. So this end code maps to .notdef
        id_range_offset.append(0)

        seg_count = len(end_code)
        max_exponent = max_power_of_two(seg_count)
        search_range = 2 * (2 ** max_exponent)
        entry_selector = max_exponent
        range_shift = 2 * seg_count - search_range

        # The end codes and the start codes are separated by a zero word
        char_code_array = end_code + [0] + start_code
        char_code_array = pack(b'>%dH' % len(char_code_array),
                               *char_code_array)
        id_delta_array = pack(b'>%dh' % len(id_delta), *id_delta)
        rest_array = id_range_offset + glyph_index_array
        rest_array = pack(b'>%dH' % len(rest_array), *rest_array)
        data = char_code_array + id_delta_array + rest_array

        length = calcsize(fmt) + len(data)
        header = pack(fmt, 4, length, 0, 2*seg_count, search_range,
                      entry_selector, range_shift)
        bmp = header + data

        fmt = b'>4HL'
        offset = calcsize(fmt)
        self.raw = pack(fmt, self.version, self.num_tables, 3, 1,
                        offset) + bmp
        # Store a BMPTable, as __init__ does, rather than the raw bytes, so
        # that get_character_map() and get_glyph_map() keep working on the
        # rebuilt table. The bytes stay available as self.bmp_table.raw.
        self.bmp_table = BMPTable(bmp)
|