Revision: 10974
Initial Code
Initial URL
Initial Description
Initial Title
Initial Tags
Initial Language
at January 20, 2009 08:00 by tamuratetsuya
Initial Code
#!-*- coding:utf-8 -*-
import htmlentitydefs
import re
# Convert HTML entity references and numeric character references back to ordinary characters.
def htmlentity2unicode(text):
    """Replace HTML entity and numeric character references with characters.

    Handles named entities (``&amp;``), decimal references (``&#38;``) and
    hexadecimal references (``&#x26;``, with either ``x`` or ``X``).  A
    reference that cannot be resolved -- an unknown entity name, or a code
    point the interpreter cannot represent -- is left in the output exactly
    as it was written instead of being dropped.

    Args:
        text: Unicode string that may contain references.

    Returns:
        A new string in which every resolvable reference has been replaced
        by the character it denotes.
    """
    # Imported locally so the function works on both Python 2 and Python 3
    # without depending on which entity module the file imported.
    try:
        from htmlentitydefs import name2codepoint
        to_char = unichr
    except ImportError:
        from html.entities import name2codepoint
        to_char = chr

    # Three alternatives: hex reference, decimal reference, named entity.
    # Named entities may contain digits after the first letter (e.g. "sup2").
    reference_regex = re.compile(
        u'&(#x[0-9a-f]+|#[0-9]+|[a-z][a-z0-9]*);', re.IGNORECASE)

    def resolve(match):
        name = match.group(1)
        try:
            if name[0] != u'#':
                # Named entity; lookup is case-sensitive ("Aacute" vs "aacute").
                return to_char(name2codepoint[name])
            if name[1] in u'xX':
                return to_char(int(name[2:], 16))
            return to_char(int(name[1:]))
        except (KeyError, ValueError, OverflowError):
            # Unknown name or unrepresentable code point: keep the source text.
            return match.group(0)

    return reference_regex.sub(resolve, text)
# Demo: run only when executed as a script, not when imported.
if __name__ == "__main__":
    # NOTE(review): the sample literal was reconstructed from a mangled copy
    # (references already decoded, remainder mis-encoded) -- confirm against
    # the author's original.  It mixes decimal references with a named entity.
    text = (u"&#25991;&#23383;&#21442;&#29031; &amp; "
            u"&#23455;&#20307;&#21442;&#29031; "
            u"を通常の文字に戻します。")
    print(htmlentity2unicode(text))
Initial URL
Initial Description
Initial Title
数値文字参照から文字へ変換
Initial Tags
python
Initial Language
Python