#! /usr/bin/env python3
from m_lib.net.www.html import HTMLParser as _HTMLParser
class HTMLHeadDone(Exception):
    """Control-flow signal: raised to abort HTML parsing early.

    It carries no data of its own; it only lets code that feeds the parser
    distinguish a deliberate early stop from a genuine parsing error.
    """
class HTMLParser(_HTMLParser):
    """HTML parser that looks for a charset declaration in <meta> tags.

    When a declaration is found it is stored, lowercased, in
    ``self.charset``.  The attribute is not created by this class otherwise,
    so callers check for it with ``hasattr``.  Parsing is cut short by
    raising HTMLHeadDone.

    NOTE(review): ``end_head`` and ``do_meta`` are presumably dispatched by
    the base parser for ``</head>`` and ``<meta ...>`` respectively; confirm
    against m_lib.net.www.html.
    """

    def end_head(self):
        """Stop parsing; raises HTMLHeadDone unconditionally."""
        raise HTMLHeadDone()

    def do_meta(self, attrs):
        """Record the charset declared by one <meta> tag, if any.

        attrs -- iterable of (name, value) pairs; empty or missing values
                 are skipped.

        Two forms are recognized:
          <meta http-equiv="Content-Type" content="text/html; charset=X">
          <meta charset="X">

        Raises HTMLHeadDone once a non-empty charset has been stored; a tag
        that declares none returns normally so parsing can continue.
        """
        http_equiv = ""
        content = ""
        charset = ""

        for attrname, value in attrs:
            if not value:
                continue
            value = value.strip()
            if attrname == 'http-equiv':
                http_equiv = value.lower()
            elif attrname == 'content':
                content = value
            elif attrname == 'charset':
                # HTML5 short form: <meta charset="UTF-8">
                charset = value.lower()

        if not charset and http_equiv == "content-type":
            # extract charset from "text/html; foo; charset=UTF-8; bar;"
            _, marker, tail = content.lower().partition('charset=')
            if marker:
                charset = tail.split(';')[0]

        # Drop padding and stray quotes, e.g. "charset= 'utf-8' ; foo".
        charset = charset.strip().strip('"\'').strip()
        if charset:
            self.charset = charset
            raise HTMLHeadDone()
def parse_html(filename):
    """Feed the HTML file *filename* to an HTMLParser and return the parser.

    The file is fed line by line until the parser raises HTMLHeadDone or the
    file ends.  If the parser ended up with a ``charset`` attribute, it is
    normalized: "windows-" is replaced by "cp" and the result lowercased.

    filename -- path of the file to read.
    Returns the HTMLParser instance; check ``hasattr(parser, "charset")``.
    Raises OSError if the file cannot be opened or read.
    """
    parser = HTMLParser()

    # The file's real encoding is unknown at this point.  latin-1 maps every
    # byte to a character, so decoding cannot fail, and the ASCII text of a
    # charset declaration comes through unchanged.
    with open(filename, 'r', encoding='latin-1') as infile:
        for line in infile:
            try:
                parser.feed(line)
            except HTMLHeadDone:
                break

    # close() may still run parser callbacks, which can raise the stop signal.
    try:
        parser.close()
    except HTMLHeadDone:
        pass

    if hasattr(parser, "charset"):
        parser.charset = parser.charset.replace("windows-", "cp").lower()
    return parser
if __name__ == '__main__':
    # Usage: <script> FILE
    # Prints the charset declared inside the HTML file or, when there is no
    # declaration, the encoding guessed by chardet.
    import sys

    try:
        path = sys.argv[1]
        parser = parse_html(path)
        if hasattr(parser, "charset"):
            print(parser.charset)
        else:
            # Imported lazily: only needed when the file declares no charset.
            import chardet

            # chardet.detect() works on raw bytes, so read in binary mode.
            with open(path, 'rb') as infile:
                charset = chardet.detect(infile.read())["encoding"]
            # These two guesses are reported as cp1251 instead.
            # NOTE(review): presumably they are chardet's usual
            # misdetections of cp1251 text -- confirm.
            if charset in ("ISO-8859-2", "MacCyrillic"):
                charset = "cp1251"
            print(charset)
    except Exception as exc:
        # Best effort: nothing goes to stdout on failure and the exit status
        # is left unchanged, but the reason is reported rather than hidden.
        print("%s: %s: %s" % (sys.argv[0], type(exc).__name__, exc),
              file=sys.stderr)