I'd like to illustrate the problem and suggest a fix by showing a
simple Python file (named htmllib2.py, so you can uncomment the
line in the doctest case to see that my fix works).
It's more of a hack than a proper fix, though:
#!/usr/bin/env python2.4
"""
Use this instead of htmllib to have entitydefs
substituted in attributes, too.
Example:
>>> import htmllib
# >>> import htmllib2 as htmllib
>>> import formatter
>>> import StringIO
>>> s = StringIO.StringIO()
>>> p = htmllib.HTMLParser(formatter.AbstractFormatter(formatter.DumbWriter(s)))
>>> p.feed('<img alt="&lt;&gt;&amp;">')
>>> s.getvalue()
'<>&'
"""
__all__ = ("HTMLParser",)
import htmllib
from htmlentitydefs import name2codepoint as entitytable
# Rebind entitytable from "entity name -> code point" to
# "entity name -> single character", dropping every entity whose code
# point is 256 or above.
entitytable = dict(
    (name, chr(codepoint))
    for name, codepoint in entitytable.items()
    if codepoint < 256
)
def entitysub(s):
    """Return *s* with named entity references replaced by their characters.

    A reference is an ``&``, a name, and a terminating ``;``.  The name is
    looked up in the module-level ``entitytable``; references whose name is
    not in the table are copied to the result unchanged.

    Unlike a naive accumulate-until-``;`` scan, this implementation

    * keeps text that follows an ``&`` with no closing ``;`` (for example
      ``"AT&T"`` is returned intact instead of being cut to ``"AT"``), and
    * restarts the reference at the last ``&`` seen before the ``;`` so a
      stray ampersand does not swallow a following valid reference.
    """
    pieces = []
    pos = 0
    while True:
        amp = s.find('&', pos)
        if amp < 0:
            break
        semi = s.find(';', amp + 1)
        if semi < 0:
            # No terminator anywhere after this '&': the rest is literal.
            break
        # Only the last '&' before the ';' can start the reference; any
        # earlier ampersands are plain text.
        amp = s.rfind('&', amp, semi)
        pieces.append(s[pos:amp])
        reference = s[amp:semi + 1]
        # Fall back to the raw "&name;" text when the name is unknown.
        pieces.append(entitytable.get(reference[1:-1], reference))
        pos = semi + 1
    # Whatever is left (including an unterminated '&...') is copied verbatim.
    pieces.append(s[pos:])
    return "".join(pieces)
class HTMLParser(htmllib.HTMLParser):
    """htmllib.HTMLParser whose start-tag attribute values get entity substitution."""

    def handle_starttag(self, tag, method, attrs):
        """Run entitysub() over every attribute value, then call *method*.

        *attrs* is iterated as (name, value) pairs; *method* receives a new
        list with the same names and the substituted values.
        """
        repaired = []
        for name, value in attrs:
            repaired.append((name, entitysub(value)))
        method(repaired)
# When executed as a script, run the doctest examples embedded in this module.
if __name__ == "__main__":
    import doctest

    doctest.testmod()
|