Source code for wcf.xml2records

#!/usr/bin/env python2
# vim: set ts=4 sw=4 tw=79 fileencoding=utf-8:

from __future__ import absolute_import

from wcf.MyHTMLParser import HTMLParser
from htmlentitydefs import name2codepoint
import re
import base64
import logging

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

from wcf.records import *
from wcf.dictionary import inverted_dict


# Lookup table from record class name to record class, built from the
# classes listed in Record.records.  It is used to pick classes whose name is
# assembled at runtime (e.g. 'Prefix' + 'Element' + 'S' + 'Record').
classes = dict(
    (record_cls.__name__, record_cls)
    for record_cls in Record.records.values()
)
#inverted_dict = dict([(n,v) for n,v in inverted_dict.iteritems()])


# Patterns used to guess which kind of text record a string should become.
# Every pattern is anchored at both ends so that only a complete value
# matches.

# Optionally signed / unsigned decimal integers.
int_reg = re.compile(r'^-?\d+$')
uint_reg = re.compile(r'^\d+$')
# A bare UUID and a "urn:uuid:" unique id; group(1) is the UUID itself.
uuid_reg = re.compile(r'^(([a-fA-F0-9]{8})-(([a-fA-F0-9]{4})-){3}([a-fA-F0-9]{12}))$')
uniqueid_reg = re.compile(r'^urn:uuid:(([a-fA-F0-9]{8})-(([a-fA-F0-9]{4})-){3}([a-fA-F0-9]{12}))$')
# Characters of the standard base64 alphabet followed by up to two '='.
base64_reg = re.compile(r'^[a-zA-Z0-9/+]*={0,2}$')
# "INF", "-INF", "NaN" or an optionally signed decimal number.  The
# alternatives are wrapped in a single group so that ^ and $ apply to all of
# them; without the group, strings such as "INFOR" or "NaNny" matched and
# negative decimals such as "-1.5" did not.
float_reg = re.compile(r'^(-?INF|NaN|-?\d+(\.\d+)?)$')
# A date, optionally followed by a time with up to seven fractional digits,
# followed by "Z" or a "+hh:mm" / "-hh:mm" offset.  The sign is a character
# class so that a lone "+" is no longer accepted as an offset.
datetime_reg = re.compile(r'^\d{4}-\d{2}-\d{2}(T\d{2}:\d{2}:\d{2}(\.\d{1,7})?)?(Z|[+-]\d{2}:\d{2})$')


class XMLParser(HTMLParser):
    """Parser that turns XML text into a tree of WCF binary XML records.

    The parser callbacks build the tree incrementally: ``self.records`` is
    the list of top-level records, ``self.last_record`` is the element that
    is currently open and ``self.data`` buffers character data until the
    next tag, comment or end tag is seen.
    """

    def reset(self):
        """Reset the parser state and start a new, empty record tree."""
        HTMLParser.reset(self)
        self.records = []
        # Placeholder root whose child list *is* self.records, so top-level
        # elements can be appended the same way as nested ones.
        self.last_record = Record()
        self.last_record.childs = self.records
        self.last_record.parent = None
        self.data = None

    def _parse_tag(self, tag):
        """Return the element record for the tag name *tag*.

        The record class depends on whether the tag has a prefix, whether
        that prefix is a single character and whether the local name is
        listed in ``inverted_dict``.
        """
        if ':' in tag:
            prefix, _, name = tag.partition(':')
            if len(prefix) == 1:
                # Single-character prefixes have one record class per
                # letter, looked up by its assembled class name.
                cls_name = 'Element' + prefix.upper() + 'Record'
                if name in inverted_dict:
                    cls_name = 'PrefixDictionary' + cls_name
                    log.debug('New %s: %s', cls_name, name)
                    return classes[cls_name](inverted_dict[name])
                cls_name = 'Prefix' + cls_name
                log.debug('New %s: %s', cls_name, name)
                return classes[cls_name](name)
            if name in inverted_dict:
                log.debug('New DictionaryElementRecord: %s:%s', prefix, name)
                return DictionaryElementRecord(prefix, inverted_dict[name])
            log.debug('New ElementRecord: %s:%s', prefix, name)
            return ElementRecord(prefix, name)
        if tag in inverted_dict:
            log.debug('New ShortDictionaryElementRecord: %s', tag)
            return ShortDictionaryElementRecord(inverted_dict[tag])
        log.debug('New ShortElementRecord: %s', tag)
        return ShortElementRecord(tag)

    def _store_data(self, data, end=False):
        """Append a text record for *data* to the currently open element.

        Data that parses to an ``EmptyTextRecord`` is dropped.  *end* is
        accepted for the callers' benefit but is currently unused.
        """
        textrecord = self._parse_data(data)
        if isinstance(textrecord, EmptyTextRecord):
            return
        log.debug('New %s: %s', type(textrecord).__name__, data)
        self.last_record.childs.append(textrecord)

    def _flush_data(self, end=False):
        """Store buffered character data, if any, and clear the buffer."""
        if self.data:
            self._store_data(self.data, end)
            self.data = None

    @staticmethod
    def _sized_record(payload, candidates):
        """Wrap *payload* in the first record class whose length field fits.

        :param payload: the string to store
        :param candidates: ``(bits, record_class)`` pairs, smallest first
        :raises ValueError: if the payload is too long for every candidate
        """
        length = len(payload)
        for bits, record_cls in candidates:
            if length < 2 ** bits:
                return record_cls(payload)
        raise ValueError('text of length %d is too long for a record'
                         % length)

    @staticmethod
    def _int_record(val):
        """Return the smallest integer text record holding *val*, or None.

        The bounds are those of signed 8/16/32/64 bit integers, which is how
        MC-NBFX defines the Int8/16/32/64 text records.  Checking only an
        upper bound would select Int8TextRecord for values such as 200 or
        -1000.
        """
        if -2 ** 7 <= val < 2 ** 7:
            return Int8TextRecord(val)
        if -2 ** 15 <= val < 2 ** 15:
            return Int16TextRecord(val)
        if -2 ** 31 <= val < 2 ** 31:
            return Int32TextRecord(val)
        if -2 ** 63 <= val < 2 ** 63:
            return Int64TextRecord(val)
        return None

    def _chars_record(self, data):
        """Return *data* as a Chars8/16/32 text record."""
        return self._sized_record(data, ((8, Chars8TextRecord),
                                         (16, Chars16TextRecord),
                                         (32, Chars32TextRecord)))

    def _parse_data(self, data):
        """Return the most specific text record for the string *data*.

        Surrounding whitespace is stripped first.  The checks run in a fixed
        order and the first one that applies wins: the literals 0/1/false/
        true, dictionary QNames, unique ids, UUIDs, integers, the empty
        string, base64, floats, dictionary strings and finally plain
        characters.
        """
        data = data.strip()

        if data == '0':
            return ZeroTextRecord()
        if data == '1':
            return OneTextRecord()
        lowered = data.lower()
        if lowered == 'false':
            return FalseTextRecord()
        if lowered == 'true':
            return TrueTextRecord()

        # "p:name" with a one-character prefix and a dictionary local name.
        if len(data) > 3 and data[1] == ':' and data[2:] in inverted_dict:
            return QNameDictionaryTextRecord(data[0],
                                             inverted_dict[data[2:]])

        match = uniqueid_reg.match(data)
        if match:
            return UniqueIdTextRecord(match.group(1))
        match = uuid_reg.match(data)
        if match:
            return UuidTextRecord(match.group(1))

        if int_reg.match(data):
            record = self._int_record(int(data))
            if record is not None:
                return record
            # Too large for a 64 bit record: keep the digits as plain text
            # instead of falling off the end and returning None.
            return self._chars_record(data)

        if data == '':
            return EmptyTextRecord()

        if base64_reg.match(data):
            try:
                decoded = base64.b64decode(data)
            except (TypeError, ValueError):
                # Right alphabet but not decodable (e.g. wrong padding).
                # Python 2 raises TypeError here, Python 3 a ValueError
                # subclass.
                decoded = None
            if decoded is not None:
                return self._sized_record(decoded,
                                          ((8, Bytes8TextRecord),
                                           (16, Bytes16TextRecord),
                                           (32, Bytes32TextRecord)))

        if float_reg.match(data):
            try:
                return DoubleTextRecord(float(data))
            except ValueError:
                # The pattern let through something float() rejects; treat
                # it like any other text below.
                pass

        if data in inverted_dict:
            return DictionaryTextRecord(inverted_dict[data])

        # TODO: produce DateTimeTextRecord for values matching datetime_reg.
        # The previous attempt was permanently disabled ("and False") and
        # used names that are not defined in this module, so date/time
        # strings are stored as plain characters for now.

        return self._chars_record(data)

    def _parse_attr(self, name, value):
        """Return the attribute record for the attribute *name* = *value*.

        Namespace declarations (``xmlns`` and ``xmlns:p``) keep their value
        as a string or dictionary index; every other attribute value is
        converted to a text record with :meth:`_parse_data`.
        """
        if ':' in name:
            prefix, _, name = name.partition(':')
            if prefix == 'xmlns':
                if value in inverted_dict:
                    return DictionaryXmlnsAttributeRecord(
                        name, inverted_dict[value])
                return XmlnsAttributeRecord(name, value)
            value = self._parse_data(value)
            if len(prefix) == 1:
                cls_name = 'Attribute' + prefix.upper() + 'Record'
                if name in inverted_dict:
                    return classes['PrefixDictionary' + cls_name](
                        inverted_dict[name], value)
                return classes['Prefix' + cls_name](name, value)
            if name in inverted_dict:
                return DictionaryAttributeRecord(prefix, inverted_dict[name],
                                                 value)
            return AttributeRecord(prefix, name, value)
        if name == 'xmlns':
            if value in inverted_dict:
                return ShortDictionaryXmlnsAttributeRecord(
                    inverted_dict[value])
            return ShortXmlnsAttributeRecord(value)
        value = self._parse_data(value)
        if name in inverted_dict:
            return ShortDictionaryAttributeRecord(inverted_dict[name], value)
        return ShortAttributeRecord(name, value)

    def _new_element(self, tag, attrs):
        """Create the record for *tag*, attach it to the open element.

        Buffered character data is stored first so that it ends up in front
        of the new element.  Returns the new element record.
        """
        self._flush_data(False)
        el = self._parse_tag(tag)
        for attr_name, attr_value in attrs:
            el.attributes.append(self._parse_attr(attr_name, attr_value))
        self.last_record.childs.append(el)
        return el

    def handle_starttag(self, tag, attrs):
        """Open a new element and make it the current one."""
        el = self._new_element(tag, attrs)
        el.parent = self.last_record
        self.last_record = el

    def handle_startendtag(self, tag, attrs):
        """Add an empty element; the current element stays unchanged."""
        self._new_element(tag, attrs)

    def handle_endtag(self, tag):
        """Store pending character data and return to the parent element."""
        self._flush_data(True)
        self.last_record = self.last_record.parent

    def handle_data(self, data):
        """Buffer character data until the next markup is seen."""
        if not self.data:
            self.data = data
        else:
            self.data += data

    def handle_charref(self, name):
        """Buffer the character for a numeric reference (&#65; / &#x41;)."""
        # Accept the upper-case hex marker as well; int('X41', 10) would
        # raise ValueError.
        if name[0] in 'xX':
            codepoint = int(name[1:], 16)
        else:
            codepoint = int(name, 10)
        # NOTE(review): on Python 2 chr() only accepts 0..255, so references
        # to larger code points still raise ValueError here - confirm what
        # string type the records expect before switching to unichr().
        self.handle_data(chr(codepoint))

    def handle_entityref(self, name):
        """Buffer the text for a named entity reference such as &amp;."""
        self.handle_data(self.unescape("&%s;" % name))

    # Declarations are buffered like ordinary character data.
    handle_decl = handle_data

    def handle_comment(self, comment):
        """Store pending character data, then add a comment record."""
        # Must test self.data: a bare "data" is not defined in this scope
        # and raised NameError for every comment.
        self._flush_data(False)
        self.last_record.childs.append(CommentRecord(comment))

    def parse_marked_section(self, i, report=1):
        """Parse a ``<![...]`` marked section starting at index *i*.

        Same as the ``markupbase`` implementation, except that the content
        of a CDATA section is passed to :meth:`handle_data` instead of
        :meth:`unknown_decl`.  Returns the index after the section, or a
        negative number if the section is incomplete.
        """
        from markupbase import _markedsectionclose, _msmarkedsectionclose
        rawdata = self.rawdata
        assert rawdata[i:i+3] == '<![', \
            "unexpected call to parse_marked_section()"
        sectName, j = self._scan_name(i+3, i)
        if j < 0:
            return j
        if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
            # look for standard ]]> ending
            match = _markedsectionclose.search(rawdata, i+3)
        elif sectName in ("if", "else", "endif"):
            # look for MS Office ]> ending
            match = _msmarkedsectionclose.search(rawdata, i+3)
        else:
            self.error('unknown status keyword %r in marked section'
                       % rawdata[i+3:j])
        if not match:
            return -1
        if report:
            if sectName == "cdata":
                assert rawdata[j] == '['
                self.handle_data(rawdata[j+1:match.start(0)])
            else:
                j = match.start(0)
                self.unknown_decl(rawdata[i+3: j])
        return match.end(0)

    @classmethod
    def parse(cls, data):
        """
        Parses a XML String/Fileobject into a Record tree

        :param data: a XML string or fileobject
        :returns: a Record tree
        :raises ValueError: if data is neither a string nor has ``read()``

        >>> from wcf.records import dump_records, print_records
        >>> from wcf.xml2records import XMLParser
        >>> r = XMLParser.parse('<s:Envelope><b:Body /></s:Envelope>')
        >>> dump_records(r)
        'V\\x02E\\x0e\\x01\\x01'
        >>> b = print_records(r)  # doctest: +NORMALIZE_WHITESPACE
        <s:Envelope >
         <b:Body ></b:Body>
        </s:Envelope>
        """
        p = cls()
        xml = None
        if isinstance(data, str):
            xml = data
        elif hasattr(data, 'read'):
            xml = data.read()
        else:
            raise ValueError("%s has an incompatible type %s"
                             % (data, type(data)))
        p.feed(xml)

        return p.records
if __name__ == '__main__':
    # Command line use: convert the XML document in the file named by the
    # first argument (or read from stdin when no argument is given) and
    # write the dumped records to stdout.
    import sys

    logging.basicConfig(level=logging.INFO)

    if len(sys.argv) > 1:
        # The with block closes the file even if reading fails.
        with open(sys.argv[1], 'r') as fp:
            indata = fp.read()
    else:
        indata = sys.stdin.read()

    p = XMLParser()
    p.feed(indata)

    sys.stdout.write(dump_records(p.records))