哈夫曼采用整数进行符号编码的,使得其不能更好的逼近信息熵极限。 例如,如果 A 的出现频次是 0.5,B 的出现频次是 0.4,C 的出现频次是 0.1。那么我们应该期待 B 的编码长度接近于 A,而不是 C,但是事实上 B 的编码长度和 C 一样长,是 A 的编码的两倍长。因此,其达不到最佳的编码长度。
defbin(value, valueRange): result = list() valueUp = D(value+valueRange) value = D(value)
while1: value *= D("2") valueUp *= D("2") # 均大于 if value >= 1: value -= D('1') valueUp -= D('1') result.append(1) # 均小于 elif valueUp < 1: result.append(0) # low0up1 else: if valueUp == D('1'): result.append(0) result.append(1) break while1: i = result.pop() if i: result.append(i) break return result
defdec(value: list) -> decimal.Decimal: w = D('1') result = D('0') for i inrange(0, len(value)): w *= D('0.5') result += w*value[i] return result
defencode(): encode_str = input() low = [D('0')]
for i inrange(0, len(distribute)): low.append(D(distribute[i])+D(low[i])) nowRange = D('1') l = D('0') for i in encode_str: index = chars.index(i) l = l+nowRange*low[index] nowRange = nowRange*(low[index+1]-low[index]) returnbin(l, nowRange) # h = thedict.get(encode_str[i]) # print(h)
defdecode(codebin: list, codelenth): codedec = dec(codebin) result = str() low = [D('0')] for i inrange(0, len(distribute)): low.append(D(distribute[i])+D(low[i])) nowRange = D('1') l = D('0') st = D('0') for j inrange(0, codelenth): for i inrange(1, len(low)): st = (low[i])*nowRange+l # ed = st + nowRange if codedec < st: index = i-1 # index = chars.index(i) l = l+nowRange*low[index] nowRange = nowRange*(low[index+1]-low[index]) result += chars[index] break return result
if __name__ == '__main__': D = decimal.Decimal decimal.getcontext().prec = 32
chars = input("Dict:").split(',') # if default if chars[0] == "0": chars = ["A", "B", "C", "D"] distribute = [D('0.1'), D('0.4'), D('0.2'), D('0.3')] else: distribute = input("Distribute:").split(",") ans = encode()