# Unicode code-point ranges
# Table of Unicode code-point ranges in the Basic Multilingual Plane.
#
# Each key is a range written as two ``\uXXXX`` escapes (lower-case hex, with a
# literal backslash) joined by ``-``; get_lang_code() drops the key into a
# regex character class.  Each value is a dict of strings:
#   start_code_10 / end_code_10 : first / last code point, decimal
#   start_code_16 / end_code_16 : first / last code point, upper-case hex
#   char_count                  : number of code points in the range
#   code_type_zh / code_type_en : Chinese / English name of the range
unicode_dic = {
    '\\u0000-\\u007f': {'start_code_10': '0', 'end_code_10': '127', 'start_code_16': '0000', 'end_code_16': '007F',
        'char_count': '128', 'code_type_zh': 'C0控制符及基本拉丁文',
        'code_type_en': 'C0 Control and Basic Latin'},
    '\\u0080-\\u00ff': {'start_code_10': '128', 'end_code_10': '255', 'start_code_16': '0080', 'end_code_16': '00FF',
        'char_count': '128', 'code_type_zh': 'C1控制符及拉丁文补充-1',
        'code_type_en': 'C1 Control and Latin 1 Supplement'},
    '\\u0100-\\u017f': {'start_code_10': '256', 'end_code_10': '383', 'start_code_16': '0100', 'end_code_16': '017F',
        'char_count': '128', 'code_type_zh': '拉丁文扩展-A', 'code_type_en': 'Latin Extended-A'},
    '\\u0180-\\u024f': {'start_code_10': '384', 'end_code_10': '591', 'start_code_16': '0180', 'end_code_16': '024F',
        'char_count': '208', 'code_type_zh': '拉丁文扩展-B', 'code_type_en': 'Latin Extended-B'},
    '\\u0250-\\u02af': {'start_code_10': '592', 'end_code_10': '687', 'start_code_16': '0250', 'end_code_16': '02AF',
        'char_count': '96', 'code_type_zh': '国际音标扩展', 'code_type_en': 'IPA Extensions'},
    '\\u02b0-\\u02ff': {'start_code_10': '688', 'end_code_10': '767', 'start_code_16': '02B0', 'end_code_16': '02FF',
        'char_count': '80', 'code_type_zh': '空白修饰字母', 'code_type_en': 'Spacing Modifiers'},
    '\\u0300-\\u036f': {'start_code_10': '768', 'end_code_10': '879', 'start_code_16': '0300', 'end_code_16': '036F',
        'char_count': '112', 'code_type_zh': '结合用读音符号', 'code_type_en': 'Combining Diacritics Marks'},
    '\\u0370-\\u03ff': {'start_code_10': '880', 'end_code_10': '1023', 'start_code_16': '0370', 'end_code_16': '03FF',
        'char_count': '144', 'code_type_zh': '希腊文及科普特文', 'code_type_en': 'Greek and Coptic'},
    '\\u0400-\\u04ff': {'start_code_10': '1024', 'end_code_10': '1279', 'start_code_16': '0400', 'end_code_16': '04FF',
        'char_count': '256', 'code_type_zh': '西里尔字母', 'code_type_en': 'Cyrillic'},
    '\\u0500-\\u052f': {'start_code_10': '1280', 'end_code_10': '1327', 'start_code_16': '0500', 'end_code_16': '052F',
        'char_count': '48', 'code_type_zh': '西里尔字母补充', 'code_type_en': 'Cyrillic Supplement'},
    '\\u0530-\\u058f': {'start_code_10': '1328', 'end_code_10': '1423', 'start_code_16': '0530', 'end_code_16': '058F',
        'char_count': '96', 'code_type_zh': '亚美尼亚语', 'code_type_en': 'Armenian'},
    '\\u0590-\\u05ff': {'start_code_10': '1424', 'end_code_10': '1535', 'start_code_16': '0590', 'end_code_16': '05FF',
        'char_count': '112', 'code_type_zh': '希伯来文', 'code_type_en': 'Hebrew'},
    '\\u0600-\\u06ff': {'start_code_10': '1536', 'end_code_10': '1791', 'start_code_16': '0600', 'end_code_16': '06FF',
        'char_count': '256', 'code_type_zh': '阿拉伯文', 'code_type_en': 'Arabic'},
    '\\u0700-\\u074f': {'start_code_10': '1792', 'end_code_10': '1871', 'start_code_16': '0700', 'end_code_16': '074F',
        'char_count': '80', 'code_type_zh': '叙利亚文', 'code_type_en': 'Syriac'},
    '\\u0750-\\u077f': {'start_code_10': '1872', 'end_code_10': '1919', 'start_code_16': '0750', 'end_code_16': '077F',
        'char_count': '48', 'code_type_zh': '阿拉伯文补充', 'code_type_en': 'Arabic Supplement'},
    '\\u0780-\\u07bf': {'start_code_10': '1920', 'end_code_10': '1983', 'start_code_16': '0780', 'end_code_16': '07BF',
        'char_count': '64', 'code_type_zh': '马尔代夫语', 'code_type_en': 'Thaana'},
    '\\u07c0-\\u07ff': {'start_code_10': '1984', 'end_code_10': '2047', 'start_code_16': '07C0', 'end_code_16': '07FF',
        'char_count': '64', 'code_type_zh': '西非書面語言', 'code_type_en': "N'Ko"},
    '\\u0800-\\u085f': {'start_code_10': '2048', 'end_code_10': '2143', 'start_code_16': '0800', 'end_code_16': '085F',
        'char_count': '96', 'code_type_zh': '阿维斯塔语及巴列维语', 'code_type_en': 'Avestan and Pahlavi'},
    '\\u0860-\\u087f': {'start_code_10': '2144', 'end_code_10': '2175', 'start_code_16': '0860', 'end_code_16': '087F',
        'char_count': '32', 'code_type_zh': 'Mandaic', 'code_type_en': 'Mandaic'},
    '\\u0880-\\u08af': {'start_code_10': '2176', 'end_code_10': '2223', 'start_code_16': '0880', 'end_code_16': '08AF',
        'char_count': '48', 'code_type_zh': '撒马利亚语', 'code_type_en': 'Samaritan'},
    '\\u0900-\\u097f': {'start_code_10': '2304', 'end_code_10': '2431', 'start_code_16': '0900', 'end_code_16': '097F',
        'char_count': '128', 'code_type_zh': '天城文书', 'code_type_en': 'Devanagari'},
    '\\u0980-\\u09ff': {'start_code_10': '2432', 'end_code_10': '2559', 'start_code_16': '0980', 'end_code_16': '09FF',
        'char_count': '128', 'code_type_zh': '孟加拉语', 'code_type_en': 'Bengali'},
    '\\u0a00-\\u0a7f': {'start_code_10': '2560', 'end_code_10': '2687', 'start_code_16': '0A00', 'end_code_16': '0A7F',
        'char_count': '128', 'code_type_zh': '锡克教文', 'code_type_en': 'Gurmukhi'},
    '\\u0a80-\\u0aff': {'start_code_10': '2688', 'end_code_10': '2815', 'start_code_16': '0A80', 'end_code_16': '0AFF',
        'char_count': '128', 'code_type_zh': '古吉拉特文', 'code_type_en': 'Gujarati'},
    '\\u0b00-\\u0b7f': {'start_code_10': '2816', 'end_code_10': '2943', 'start_code_16': '0B00', 'end_code_16': '0B7F',
        'char_count': '128', 'code_type_zh': '奥里亚文', 'code_type_en': 'Oriya'},
    '\\u0b80-\\u0bff': {'start_code_10': '2944', 'end_code_10': '3071', 'start_code_16': '0B80', 'end_code_16': '0BFF',
        'char_count': '128', 'code_type_zh': '泰米尔文', 'code_type_en': 'Tamil'},
    '\\u0c00-\\u0c7f': {'start_code_10': '3072', 'end_code_10': '3199', 'start_code_16': '0C00', 'end_code_16': '0C7F',
        'char_count': '128', 'code_type_zh': '泰卢固文', 'code_type_en': 'Telugu'},
    '\\u0c80-\\u0cff': {'start_code_10': '3200', 'end_code_10': '3327', 'start_code_16': '0C80', 'end_code_16': '0CFF',
        'char_count': '128', 'code_type_zh': '卡纳达文', 'code_type_en': 'Kannada'},
    '\\u0d00-\\u0d7f': {'start_code_10': '3328', 'end_code_10': '3455', 'start_code_16': '0D00', 'end_code_16': '0D7F',
        'char_count': '128', 'code_type_zh': '德拉维族语', 'code_type_en': 'Malayalam'},
    '\\u0d80-\\u0dff': {'start_code_10': '3456', 'end_code_10': '3583', 'start_code_16': '0D80', 'end_code_16': '0DFF',
        'char_count': '128', 'code_type_zh': '僧伽罗语', 'code_type_en': 'Sinhala'},
    '\\u0e00-\\u0e7f': {'start_code_10': '3584', 'end_code_10': '3711', 'start_code_16': '0E00', 'end_code_16': '0E7F',
        'char_count': '128', 'code_type_zh': '泰文', 'code_type_en': 'Thai'},
    '\\u0e80-\\u0eff': {'start_code_10': '3712', 'end_code_10': '3839', 'start_code_16': '0E80', 'end_code_16': '0EFF',
        'char_count': '128', 'code_type_zh': '老挝文', 'code_type_en': 'Lao'},
    '\\u0f00-\\u0fff': {'start_code_10': '3840', 'end_code_10': '4095', 'start_code_16': '0F00', 'end_code_16': '0FFF',
        'char_count': '256', 'code_type_zh': '藏文', 'code_type_en': 'Tibetan'},
    '\\u1000-\\u109f': {'start_code_10': '4096', 'end_code_10': '4255', 'start_code_16': '1000', 'end_code_16': '109F',
        'char_count': '160', 'code_type_zh': '缅甸语', 'code_type_en': 'Myanmar'},
    '\\u10a0-\\u10ff': {'start_code_10': '4256', 'end_code_10': '4351', 'start_code_16': '10A0', 'end_code_16': '10FF',
        'char_count': '96', 'code_type_zh': '格鲁吉亚语', 'code_type_en': 'Georgian'},
    '\\u1100-\\u11ff': {'start_code_10': '4352', 'end_code_10': '4607', 'start_code_16': '1100', 'end_code_16': '11FF',
        'char_count': '256', 'code_type_zh': '朝鲜文', 'code_type_en': 'Hangul Jamo'},
    '\\u1200-\\u137f': {'start_code_10': '4608', 'end_code_10': '4991', 'start_code_16': '1200', 'end_code_16': '137F',
        'char_count': '384', 'code_type_zh': '埃塞俄比亚语', 'code_type_en': 'Ethiopic'},
    '\\u1380-\\u139f': {'start_code_10': '4992', 'end_code_10': '5023', 'start_code_16': '1380', 'end_code_16': '139F',
        'char_count': '32', 'code_type_zh': '埃塞俄比亚语补充', 'code_type_en': 'Ethiopic Supplement'},
    '\\u13a0-\\u13ff': {'start_code_10': '5024', 'end_code_10': '5119', 'start_code_16': '13A0', 'end_code_16': '13FF',
        'char_count': '96', 'code_type_zh': '切罗基语', 'code_type_en': 'Cherokee'},
    '\\u1400-\\u167f': {'start_code_10': '5120', 'end_code_10': '5759', 'start_code_16': '1400', 'end_code_16': '167F',
        'char_count': '640', 'code_type_zh': '统一加拿大土著语音节',
        'code_type_en': 'Unified Canadian Aboriginal Syllabics'},
    '\\u1680-\\u169f': {'start_code_10': '5760', 'end_code_10': '5791', 'start_code_16': '1680', 'end_code_16': '169F',
        'char_count': '32', 'code_type_zh': '欧甘字母', 'code_type_en': 'Ogham'},
    '\\u16a0-\\u16ff': {'start_code_10': '5792', 'end_code_10': '5887', 'start_code_16': '16A0', 'end_code_16': '16FF',
        'char_count': '96', 'code_type_zh': '如尼文', 'code_type_en': 'Runic'},
    '\\u1700-\\u171f': {'start_code_10': '5888', 'end_code_10': '5919', 'start_code_16': '1700', 'end_code_16': '171F',
        'char_count': '32', 'code_type_zh': '塔加拉语', 'code_type_en': 'Tagalog'},
    '\\u1720-\\u173f': {'start_code_10': '5920', 'end_code_10': '5951', 'start_code_16': '1720', 'end_code_16': '173F',
        'char_count': '32', 'code_type_zh': 'Hanunóo', 'code_type_en': 'Hanunóo'},
    '\\u1740-\\u175f': {'start_code_10': '5952', 'end_code_10': '5983', 'start_code_16': '1740', 'end_code_16': '175F',
        'char_count': '32', 'code_type_zh': 'Buhid', 'code_type_en': 'Buhid'},
    '\\u1760-\\u177f': {'start_code_10': '5984', 'end_code_10': '6015', 'start_code_16': '1760', 'end_code_16': '177F',
        'char_count': '32', 'code_type_zh': 'Tagbanwa', 'code_type_en': 'Tagbanwa'},
    '\\u1780-\\u17ff': {'start_code_10': '6016', 'end_code_10': '6143', 'start_code_16': '1780', 'end_code_16': '17FF',
        'char_count': '128', 'code_type_zh': '高棉语', 'code_type_en': 'Khmer'},
    '\\u1800-\\u18af': {'start_code_10': '6144', 'end_code_10': '6319', 'start_code_16': '1800', 'end_code_16': '18AF',
        'char_count': '176', 'code_type_zh': '蒙古文', 'code_type_en': 'Mongolian'},
    '\\u18b0-\\u18ff': {'start_code_10': '6320', 'end_code_10': '6399', 'start_code_16': '18B0', 'end_code_16': '18FF',
        'char_count': '80', 'code_type_zh': 'Cham', 'code_type_en': 'Cham'},
    '\\u1900-\\u194f': {'start_code_10': '6400', 'end_code_10': '6479', 'start_code_16': '1900', 'end_code_16': '194F',
        'char_count': '80', 'code_type_zh': 'Limbu', 'code_type_en': 'Limbu'},
    '\\u1950-\\u197f': {'start_code_10': '6480', 'end_code_10': '6527', 'start_code_16': '1950', 'end_code_16': '197F',
        'char_count': '48', 'code_type_zh': '德宏泰语', 'code_type_en': 'Tai Le'},
    '\\u1980-\\u19df': {'start_code_10': '6528', 'end_code_10': '6623', 'start_code_16': '1980', 'end_code_16': '19DF',
        'char_count': '96', 'code_type_zh': '新傣仂语', 'code_type_en': 'New Tai Lue'},
    '\\u19e0-\\u19ff': {'start_code_10': '6624', 'end_code_10': '6655', 'start_code_16': '19E0', 'end_code_16': '19FF',
        'char_count': '32', 'code_type_zh': '高棉语记号', 'code_type_en': 'Khmer Symbols'},
    '\\u1a00-\\u1a1f': {'start_code_10': '6656', 'end_code_10': '6687', 'start_code_16': '1A00', 'end_code_16': '1A1F',
        'char_count': '32', 'code_type_zh': 'Buginese', 'code_type_en': 'Buginese'},
    '\\u1a20-\\u1a5f': {'start_code_10': '6688', 'end_code_10': '6751', 'start_code_16': '1A20', 'end_code_16': '1A5F',
        'char_count': '64', 'code_type_zh': 'Batak', 'code_type_en': 'Batak'},
    '\\u1a80-\\u1aef': {'start_code_10': '6784', 'end_code_10': '6895', 'start_code_16': '1A80', 'end_code_16': '1AEF',
        'char_count': '112', 'code_type_zh': 'Lanna', 'code_type_en': 'Lanna'},
    '\\u1b00-\\u1b7f': {'start_code_10': '6912', 'end_code_10': '7039', 'start_code_16': '1B00', 'end_code_16': '1B7F',
        'char_count': '128', 'code_type_zh': '巴厘语', 'code_type_en': 'Balinese'},
    '\\u1b80-\\u1bb0': {'start_code_10': '7040', 'end_code_10': '7088', 'start_code_16': '1B80', 'end_code_16': '1BB0',
        'char_count': '49', 'code_type_zh': '巽他语', 'code_type_en': 'Sundanese'},
    '\\u1bc0-\\u1bff': {'start_code_10': '7104', 'end_code_10': '7167', 'start_code_16': '1BC0', 'end_code_16': '1BFF',
        'char_count': '64', 'code_type_zh': 'Pahawh Hmong', 'code_type_en': 'Pahawh Hmong'},
    '\\u1c00-\\u1c4f': {'start_code_10': '7168', 'end_code_10': '7247', 'start_code_16': '1C00', 'end_code_16': '1C4F',
        'char_count': '80', 'code_type_zh': '雷布查语', 'code_type_en': 'Lepcha'},
    '\\u1c50-\\u1c7f': {'start_code_10': '7248', 'end_code_10': '7295', 'start_code_16': '1C50', 'end_code_16': '1C7F',
        'char_count': '48', 'code_type_zh': 'Ol Chiki', 'code_type_en': 'Ol Chiki'},
    '\\u1c80-\\u1cdf': {'start_code_10': '7296', 'end_code_10': '7391', 'start_code_16': '1C80', 'end_code_16': '1CDF',
        'char_count': '96', 'code_type_zh': '曼尼普尔语', 'code_type_en': 'Meithei/Manipuri'},
    '\\u1d00-\\u1d7f': {'start_code_10': '7424', 'end_code_10': '7551', 'start_code_16': '1D00', 'end_code_16': '1D7F',
        'char_count': '128', 'code_type_zh': '语音学扩展', 'code_type_en': 'Phonetic Extensions'},
    '\\u1d80-\\u1dbf': {'start_code_10': '7552', 'end_code_10': '7615', 'start_code_16': '1D80', 'end_code_16': '1DBF',
        'char_count': '64', 'code_type_zh': '语音学扩展补充',
        'code_type_en': 'Phonetic Extensions Supplement'},
    '\\u1dc0-\\u1dff': {'start_code_10': '7616', 'end_code_10': '7679', 'start_code_16': '1DC0', 'end_code_16': '1DFF',
        'char_count': '64', 'code_type_zh': '结合用读音符号补充',
        'code_type_en': 'Combining Diacritics Marks Supplement'},
    '\\u1e00-\\u1eff': {'start_code_10': '7680', 'end_code_10': '7935', 'start_code_16': '1E00', 'end_code_16': '1EFF',
        'char_count': '256', 'code_type_zh': '拉丁文扩充附加', 'code_type_en': 'Latin Extended Additional'},
    '\\u1f00-\\u1fff': {'start_code_10': '7936', 'end_code_10': '8191', 'start_code_16': '1F00', 'end_code_16': '1FFF',
        'char_count': '256', 'code_type_zh': '希腊语扩充', 'code_type_en': 'Greek Extended'},
    '\\u2000-\\u206f': {'start_code_10': '8192', 'end_code_10': '8303', 'start_code_16': '2000', 'end_code_16': '206F',
        'char_count': '112', 'code_type_zh': '常用标点', 'code_type_en': 'General Punctuation'},
    '\\u2070-\\u209f': {'start_code_10': '8304', 'end_code_10': '8351', 'start_code_16': '2070', 'end_code_16': '209F',
        'char_count': '48', 'code_type_zh': '上标及下标', 'code_type_en': 'Superscripts and Subscripts'},
    '\\u20a0-\\u20cf': {'start_code_10': '8352', 'end_code_10': '8399', 'start_code_16': '20A0', 'end_code_16': '20CF',
        'char_count': '48', 'code_type_zh': '货币符号', 'code_type_en': 'Currency Symbols'},
    '\\u20d0-\\u20ff': {'start_code_10': '8400', 'end_code_10': '8447', 'start_code_16': '20D0', 'end_code_16': '20FF',
        'char_count': '48', 'code_type_zh': '组合用记号',
        'code_type_en': 'Combining Diacritics Marks for Symbols'},
    '\\u2100-\\u214f': {'start_code_10': '8448', 'end_code_10': '8527', 'start_code_16': '2100', 'end_code_16': '214F',
        'char_count': '80', 'code_type_zh': '字母式符号', 'code_type_en': 'Letterlike Symbols'},
    '\\u2150-\\u218f': {'start_code_10': '8528', 'end_code_10': '8591', 'start_code_16': '2150', 'end_code_16': '218F',
        'char_count': '64', 'code_type_zh': '数字形式', 'code_type_en': 'Number Form'},
    '\\u2190-\\u21ff': {'start_code_10': '8592', 'end_code_10': '8703', 'start_code_16': '2190', 'end_code_16': '21FF',
        'char_count': '112', 'code_type_zh': '箭头', 'code_type_en': 'Arrows'},
    '\\u2200-\\u22ff': {'start_code_10': '8704', 'end_code_10': '8959', 'start_code_16': '2200', 'end_code_16': '22FF',
        'char_count': '256', 'code_type_zh': '数学运算符', 'code_type_en': 'Mathematical Operator'},
    '\\u2300-\\u23ff': {'start_code_10': '8960', 'end_code_10': '9215', 'start_code_16': '2300', 'end_code_16': '23FF',
        'char_count': '256', 'code_type_zh': '杂项工业符号', 'code_type_en': 'Miscellaneous Technical'},
    '\\u2400-\\u243f': {'start_code_10': '9216', 'end_code_10': '9279', 'start_code_16': '2400', 'end_code_16': '243F',
        'char_count': '64', 'code_type_zh': '控制图片', 'code_type_en': 'Control Pictures'},
    '\\u2440-\\u245f': {'start_code_10': '9280', 'end_code_10': '9311', 'start_code_16': '2440', 'end_code_16': '245F',
        'char_count': '32', 'code_type_zh': '光学识别符', 'code_type_en': 'Optical Character Recognition'},
    '\\u2460-\\u24ff': {'start_code_10': '9312', 'end_code_10': '9471', 'start_code_16': '2460', 'end_code_16': '24FF',
        'char_count': '160', 'code_type_zh': '封闭式字母数字', 'code_type_en': 'Enclosed Alphanumerics'},
    '\\u2500-\\u257f': {'start_code_10': '9472', 'end_code_10': '9599', 'start_code_16': '2500', 'end_code_16': '257F',
        'char_count': '128', 'code_type_zh': '制表符', 'code_type_en': 'Box Drawing'},
    '\\u2580-\\u259f': {'start_code_10': '9600', 'end_code_10': '9631', 'start_code_16': '2580', 'end_code_16': '259F',
        'char_count': '32', 'code_type_zh': '方块元素', 'code_type_en': 'Block Element'},
    '\\u25a0-\\u25ff': {'start_code_10': '9632', 'end_code_10': '9727', 'start_code_16': '25A0', 'end_code_16': '25FF',
        'char_count': '96', 'code_type_zh': '几何图形', 'code_type_en': 'Geometric Shapes'},
    '\\u2600-\\u26ff': {'start_code_10': '9728', 'end_code_10': '9983', 'start_code_16': '2600', 'end_code_16': '26FF',
        'char_count': '256', 'code_type_zh': '杂项符号', 'code_type_en': 'Miscellaneous Symbols'},
    '\\u2700-\\u27bf': {'start_code_10': '9984', 'end_code_10': '10175', 'start_code_16': '2700', 'end_code_16': '27BF',
        'char_count': '192', 'code_type_zh': '印刷符号', 'code_type_en': 'Dingbats'},
    '\\u27c0-\\u27ef': {'start_code_10': '10176', 'end_code_10': '10223', 'start_code_16': '27C0',
        'end_code_16': '27EF', 'char_count': '48', 'code_type_zh': '杂项数学符号-A',
        'code_type_en': 'Miscellaneous Mathematical Symbols-A'},
    '\\u27f0-\\u27ff': {'start_code_10': '10224', 'end_code_10': '10239', 'start_code_16': '27F0',
        'end_code_16': '27FF', 'char_count': '16', 'code_type_zh': '追加箭头-A',
        'code_type_en': 'Supplemental Arrows-A'},
    '\\u2800-\\u28ff': {'start_code_10': '10240', 'end_code_10': '10495', 'start_code_16': '2800',
        'end_code_16': '28FF', 'char_count': '256', 'code_type_zh': '盲文点字模型',
        'code_type_en': 'Braille Patterns'},
    '\\u2900-\\u297f': {'start_code_10': '10496', 'end_code_10': '10623', 'start_code_16': '2900',
        'end_code_16': '297F', 'char_count': '128', 'code_type_zh': '追加箭头-B',
        'code_type_en': 'Supplemental Arrows-B'},
    '\\u2980-\\u29ff': {'start_code_10': '10624', 'end_code_10': '10751', 'start_code_16': '2980',
        'end_code_16': '29FF', 'char_count': '128', 'code_type_zh': '杂项数学符号-B',
        'code_type_en': 'Miscellaneous Mathematical Symbols-B'},
    '\\u2a00-\\u2aff': {'start_code_10': '10752', 'end_code_10': '11007', 'start_code_16': '2A00',
        'end_code_16': '2AFF', 'char_count': '256', 'code_type_zh': '追加数学运算符',
        'code_type_en': 'Supplemental Mathematical Operator'},
    '\\u2b00-\\u2bff': {'start_code_10': '11008', 'end_code_10': '11263', 'start_code_16': '2B00',
        'end_code_16': '2BFF', 'char_count': '256', 'code_type_zh': '杂项符号和箭头',
        'code_type_en': 'Miscellaneous Symbols and Arrows'},
    '\\u2c00-\\u2c5f': {'start_code_10': '11264', 'end_code_10': '11359', 'start_code_16': '2C00',
        'end_code_16': '2C5F', 'char_count': '96', 'code_type_zh': '格拉哥里字母',
        'code_type_en': 'Glagolitic'},
    '\\u2c60-\\u2c7f': {'start_code_10': '11360', 'end_code_10': '11391', 'start_code_16': '2C60',
        'end_code_16': '2C7F', 'char_count': '32', 'code_type_zh': '拉丁文扩展-C',
        'code_type_en': 'Latin Extended-C'},
    '\\u2c80-\\u2cff': {'start_code_10': '11392', 'end_code_10': '11519', 'start_code_16': '2C80',
        'end_code_16': '2CFF', 'char_count': '128', 'code_type_zh': '古埃及语', 'code_type_en': 'Coptic'},
    '\\u2d00-\\u2d2f': {'start_code_10': '11520', 'end_code_10': '11567', 'start_code_16': '2D00',
        'end_code_16': '2D2F', 'char_count': '48', 'code_type_zh': '格鲁吉亚语补充',
        'code_type_en': 'Georgian Supplement'},
    '\\u2d30-\\u2d7f': {'start_code_10': '11568', 'end_code_10': '11647', 'start_code_16': '2D30',
        'end_code_16': '2D7F', 'char_count': '80', 'code_type_zh': '提非纳文', 'code_type_en': 'Tifinagh'},
    '\\u2d80-\\u2ddf': {'start_code_10': '11648', 'end_code_10': '11743', 'start_code_16': '2D80',
        'end_code_16': '2DDF', 'char_count': '96', 'code_type_zh': '埃塞俄比亚语扩展',
        'code_type_en': 'Ethiopic Extended'},
    '\\u2e00-\\u2e7f': {'start_code_10': '11776', 'end_code_10': '11903', 'start_code_16': '2E00',
        'end_code_16': '2E7F', 'char_count': '128', 'code_type_zh': '追加标点',
        'code_type_en': 'Supplemental Punctuation'},
    '\\u2e80-\\u2eff': {'start_code_10': '11904', 'end_code_10': '12031', 'start_code_16': '2E80',
        'end_code_16': '2EFF', 'char_count': '128', 'code_type_zh': 'CJK 部首补充',
        'code_type_en': 'CJK Radicals Supplement'},
    '\\u2f00-\\u2fdf': {'start_code_10': '12032', 'end_code_10': '12255', 'start_code_16': '2F00',
        'end_code_16': '2FDF', 'char_count': '224', 'code_type_zh': '康熙字典部首',
        'code_type_en': 'Kangxi Radicals'},
    '\\u2ff0-\\u2fff': {'start_code_10': '12272', 'end_code_10': '12287', 'start_code_16': '2FF0',
        'end_code_16': '2FFF', 'char_count': '16', 'code_type_zh': '表意文字描述符',
        'code_type_en': 'Ideographic Description Characters'},
    '\\u3000-\\u303f': {'start_code_10': '12288', 'end_code_10': '12351', 'start_code_16': '3000',
        'end_code_16': '303F', 'char_count': '64', 'code_type_zh': 'CJK 符号和标点',
        'code_type_en': 'CJK Symbols and Punctuation'},
    '\\u3040-\\u309f': {'start_code_10': '12352', 'end_code_10': '12447', 'start_code_16': '3040',
        'end_code_16': '309F', 'char_count': '96', 'code_type_zh': '日文平假名', 'code_type_en': 'Hiragana'},
    '\\u30a0-\\u30ff': {'start_code_10': '12448', 'end_code_10': '12543', 'start_code_16': '30A0',
        'end_code_16': '30FF', 'char_count': '96', 'code_type_zh': '日文片假名', 'code_type_en': 'Katakana'},
    '\\u3100-\\u312f': {'start_code_10': '12544', 'end_code_10': '12591', 'start_code_16': '3100',
        'end_code_16': '312F', 'char_count': '48', 'code_type_zh': '注音字母', 'code_type_en': 'Bopomofo'},
    '\\u3130-\\u318f': {'start_code_10': '12592', 'end_code_10': '12687', 'start_code_16': '3130',
        'end_code_16': '318F', 'char_count': '96', 'code_type_zh': '朝鲜文兼容字母',
        'code_type_en': 'Hangul Compatibility Jamo'},
    '\\u3190-\\u319f': {'start_code_10': '12688', 'end_code_10': '12703', 'start_code_16': '3190',
        'end_code_16': '319F', 'char_count': '16', 'code_type_zh': '象形字注释标志', 'code_type_en': 'Kanbun'},
    '\\u31a0-\\u31bf': {'start_code_10': '12704', 'end_code_10': '12735', 'start_code_16': '31A0',
        'end_code_16': '31BF', 'char_count': '32', 'code_type_zh': '注音字母扩展',
        'code_type_en': 'Bopomofo Extended'},
    '\\u31c0-\\u31ef': {'start_code_10': '12736', 'end_code_10': '12783', 'start_code_16': '31C0',
        'end_code_16': '31EF', 'char_count': '48', 'code_type_zh': 'CJK 笔画',
        'code_type_en': 'CJK Strokes'},
    '\\u31f0-\\u31ff': {'start_code_10': '12784', 'end_code_10': '12799', 'start_code_16': '31F0',
        'end_code_16': '31FF', 'char_count': '16', 'code_type_zh': '日文片假名语音扩展',
        'code_type_en': 'Katakana Phonetic Extensions'},
    '\\u3200-\\u32ff': {'start_code_10': '12800', 'end_code_10': '13055', 'start_code_16': '3200',
        'end_code_16': '32FF', 'char_count': '256', 'code_type_zh': '封闭式 CJK 文字和月份',
        'code_type_en': 'Enclosed CJK Letters and Months'},
    '\\u3300-\\u33ff': {'start_code_10': '13056', 'end_code_10': '13311', 'start_code_16': '3300',
        'end_code_16': '33FF', 'char_count': '256', 'code_type_zh': 'CJK 兼容',
        'code_type_en': 'CJK Compatibility'},
    '\\u3400-\\u4dbf': {'start_code_10': '13312', 'end_code_10': '19903', 'start_code_16': '3400',
        'end_code_16': '4DBF', 'char_count': '6592', 'code_type_zh': 'CJK 统一表意符号扩展 A',
        'code_type_en': 'CJK Unified Ideographs Extension A'},
    '\\u4dc0-\\u4dff': {'start_code_10': '19904', 'end_code_10': '19967', 'start_code_16': '4DC0',
        'end_code_16': '4DFF', 'char_count': '64', 'code_type_zh': '易经六十四卦符号',
        'code_type_en': 'Yijing Hexagrams Symbols'},
    '\\u4e00-\\u9fbf': {'start_code_10': '19968', 'end_code_10': '40895', 'start_code_16': '4E00',
        'end_code_16': '9FBF', 'char_count': '20928', 'code_type_zh': 'CJK 统一表意符号',
        'code_type_en': 'CJK Unified Ideographs'},
    '\\ua000-\\ua48f': {'start_code_10': '40960', 'end_code_10': '42127', 'start_code_16': 'A000',
        'end_code_16': 'A48F', 'char_count': '1168', 'code_type_zh': '彝文音节',
        'code_type_en': 'Yi Syllables'},
    '\\ua490-\\ua4cf': {'start_code_10': '42128', 'end_code_10': '42191', 'start_code_16': 'A490',
        'end_code_16': 'A4CF', 'char_count': '64', 'code_type_zh': '彝文字根',
        'code_type_en': 'Yi Radicals'},
    '\\ua500-\\ua61f': {'start_code_10': '42240', 'end_code_10': '42527', 'start_code_16': 'A500',
        'end_code_16': 'A61F', 'char_count': '288', 'code_type_zh': 'Vai', 'code_type_en': 'Vai'},
    '\\ua660-\\ua6ff': {'start_code_10': '42592', 'end_code_10': '42751', 'start_code_16': 'A660',
        'end_code_16': 'A6FF', 'char_count': '160', 'code_type_zh': '统一加拿大土著语音节补充',
        'code_type_en': 'Unified Canadian Aboriginal Syllabics Supplement'},
    '\\ua700-\\ua71f': {'start_code_10': '42752', 'end_code_10': '42783', 'start_code_16': 'A700',
        'end_code_16': 'A71F', 'char_count': '32', 'code_type_zh': '声调修饰字母',
        'code_type_en': 'Modifier Tone Letters'},
    '\\ua720-\\ua7ff': {'start_code_10': '42784', 'end_code_10': '43007', 'start_code_16': 'A720',
        'end_code_16': 'A7FF', 'char_count': '224', 'code_type_zh': '拉丁文扩展-D',
        'code_type_en': 'Latin Extended-D'},
    '\\ua800-\\ua82f': {'start_code_10': '43008', 'end_code_10': '43055', 'start_code_16': 'A800',
        'end_code_16': 'A82F', 'char_count': '48', 'code_type_zh': 'Syloti Nagri',
        'code_type_en': 'Syloti Nagri'},
    '\\ua840-\\ua87f': {'start_code_10': '43072', 'end_code_10': '43135', 'start_code_16': 'A840',
        'end_code_16': 'A87F', 'char_count': '64', 'code_type_zh': '八思巴字', 'code_type_en': 'Phags-pa'},
    '\\ua880-\\ua8df': {'start_code_10': '43136', 'end_code_10': '43231', 'start_code_16': 'A880',
        'end_code_16': 'A8DF', 'char_count': '96', 'code_type_zh': 'Saurashtra',
        'code_type_en': 'Saurashtra'},
    '\\ua900-\\ua97f': {'start_code_10': '43264', 'end_code_10': '43391', 'start_code_16': 'A900',
        'end_code_16': 'A97F', 'char_count': '128', 'code_type_zh': '爪哇语', 'code_type_en': 'Javanese'},
    '\\ua980-\\ua9df': {'start_code_10': '43392', 'end_code_10': '43487', 'start_code_16': 'A980',
        'end_code_16': 'A9DF', 'char_count': '96', 'code_type_zh': 'Chakma', 'code_type_en': 'Chakma'},
    '\\uaa00-\\uaa3f': {'start_code_10': '43520', 'end_code_10': '43583', 'start_code_16': 'AA00',
        'end_code_16': 'AA3F', 'char_count': '64', 'code_type_zh': 'Varang Kshiti',
        'code_type_en': 'Varang Kshiti'},
    '\\uaa40-\\uaa6f': {'start_code_10': '43584', 'end_code_10': '43631', 'start_code_16': 'AA40',
        'end_code_16': 'AA6F', 'char_count': '48', 'code_type_zh': 'Sorang Sompeng',
        'code_type_en': 'Sorang Sompeng'},
    '\\uaa80-\\uaadf': {'start_code_10': '43648', 'end_code_10': '43743', 'start_code_16': 'AA80',
        'end_code_16': 'AADF', 'char_count': '96', 'code_type_zh': 'Newari', 'code_type_en': 'Newari'},
    '\\uab00-\\uab5f': {'start_code_10': '43776', 'end_code_10': '43871', 'start_code_16': 'AB00',
        'end_code_16': 'AB5F', 'char_count': '96', 'code_type_zh': '越南傣语', 'code_type_en': 'Việt Thái'},
    '\\uab80-\\uaba0': {'start_code_10': '43904', 'end_code_10': '43936', 'start_code_16': 'AB80',
        'end_code_16': 'ABA0', 'char_count': '33', 'code_type_zh': 'Kayah Li',
        'code_type_en': 'Kayah Li'},
    '\\uac00-\\ud7af': {'start_code_10': '44032', 'end_code_10': '55215', 'start_code_16': 'AC00',
        'end_code_16': 'D7AF', 'char_count': '11184', 'code_type_zh': '朝鲜文音节',
        'code_type_en': 'Hangul Syllables'},
    '\\ud800-\\udbff': {'start_code_10': '55296', 'end_code_10': '56319', 'start_code_16': 'D800',
        'end_code_16': 'DBFF', 'char_count': '1024', 'code_type_zh': 'High-half zone of UTF-16',
        'code_type_en': 'High-half zone of UTF-16'},
    '\\udc00-\\udfff': {'start_code_10': '56320', 'end_code_10': '57343', 'start_code_16': 'DC00',
        'end_code_16': 'DFFF', 'char_count': '1024', 'code_type_zh': 'Low-half zone of UTF-16',
        'code_type_en': 'Low-half zone of UTF-16'},
    '\\ue000-\\uf8ff': {'start_code_10': '57344', 'end_code_10': '63743', 'start_code_16': 'E000',
        'end_code_16': 'F8FF', 'char_count': '6400', 'code_type_zh': '自行使用區域',
        'code_type_en': 'Private Use Zone'},
    '\\uf900-\\ufaff': {'start_code_10': '63744', 'end_code_10': '64255', 'start_code_16': 'F900',
        'end_code_16': 'FAFF', 'char_count': '512', 'code_type_zh': 'CJK 兼容象形文字',
        'code_type_en': 'CJK Compatibility Ideographs'},
    '\\ufb00-\\ufb4f': {'start_code_10': '64256', 'end_code_10': '64335', 'start_code_16': 'FB00',
        'end_code_16': 'FB4F', 'char_count': '80', 'code_type_zh': '字母表達形式',
        'code_type_en': 'Alphabetic Presentation Form'},
    '\\ufb50-\\ufdff': {'start_code_10': '64336', 'end_code_10': '65023', 'start_code_16': 'FB50',
        'end_code_16': 'FDFF', 'char_count': '688', 'code_type_zh': '阿拉伯表達形式A',
        'code_type_en': 'Arabic Presentation Form-A'},
    '\\ufe00-\\ufe0f': {'start_code_10': '65024', 'end_code_10': '65039', 'start_code_16': 'FE00',
        'end_code_16': 'FE0F', 'char_count': '16', 'code_type_zh': '变量选择符',
        'code_type_en': 'Variation Selector'},
    '\\ufe10-\\ufe1f': {'start_code_10': '65040', 'end_code_10': '65055', 'start_code_16': 'FE10',
        'end_code_16': 'FE1F', 'char_count': '16', 'code_type_zh': '竖排形式',
        'code_type_en': 'Vertical Forms'},
    '\\ufe20-\\ufe2f': {'start_code_10': '65056', 'end_code_10': '65071', 'start_code_16': 'FE20',
        'end_code_16': 'FE2F', 'char_count': '16', 'code_type_zh': '组合用半符号',
        'code_type_en': 'Combining Half Marks'},
    '\\ufe30-\\ufe4f': {'start_code_10': '65072', 'end_code_10': '65103', 'start_code_16': 'FE30',
        'end_code_16': 'FE4F', 'char_count': '32', 'code_type_zh': 'CJK 兼容形式',
        'code_type_en': 'CJK Compatibility Forms'},
    '\\ufe50-\\ufe6f': {'start_code_10': '65104', 'end_code_10': '65135', 'start_code_16': 'FE50',
        'end_code_16': 'FE6F', 'char_count': '32', 'code_type_zh': '小型变体形式',
        'code_type_en': 'Small Form Variants'},
    '\\ufe70-\\ufeff': {'start_code_10': '65136', 'end_code_10': '65279', 'start_code_16': 'FE70',
        'end_code_16': 'FEFF', 'char_count': '144', 'code_type_zh': '阿拉伯表達形式B',
        'code_type_en': 'Arabic Presentation Form-B'},
    '\\uff00-\\uffef': {'start_code_10': '65280', 'end_code_10': '65519', 'start_code_16': 'FF00',
        'end_code_16': 'FFEF', 'char_count': '240', 'code_type_zh': '半型及全型形式',
        'code_type_en': 'Halfwidth and Fullwidth Form'},
    '\\ufff0-\\uffff': {'start_code_10': '65520', 'end_code_10': '65535', 'start_code_16': 'FFF0',
        'end_code_16': 'FFFF', 'char_count': '16', 'code_type_zh': '特殊', 'code_type_en': 'Specials'},
}
# Detect which Unicode ranges occur in a string
def get_lang_code(content):
    """Return the ``unicode_dic`` keys whose code-point range occurs in *content*.

    Every key of ``unicode_dic`` is a range spelled with regex escapes (for
    example ``\\u4e00-\\u9fbf``).  The key is placed inside a regex character
    class and searched for in ``content``.

    Args:
        content: The text to inspect.

    Returns:
        A list of range keys, in ``unicode_dic`` iteration order, for which at
        least one character of ``content`` lies inside the range.  The list is
        empty when nothing matches, e.g. for an empty string.
    """
    # Imported locally so the function does not depend on a module-level import.
    import re

    # Only existence matters, so re.search (stops at the first hit) replaces
    # re.findall, which would collect every match for every range.
    return [code for code in unicode_dic if re.search("[%s]" % code, content)]