Module preParser
[hide private]
[frames | no frames]

Source Code for Module preParser

  1   
  2  from baseObjects import PreParser 
  3  from document import StringDocument 
  4  import re, gzip, string, binascii, cStringIO as StringIO 
  5  import bz2 
  6  import httplib, mimetypes, tempfile, os, commands, time 
  7  from PyZ3950.zmarc import MARC 
  8  from xml.sax.saxutils import escape 
  9   
 10  # XXX All PreParsers should set mimetype, and record in/out mimetype 
 11   
 12   
 13  # --- Wrapper --- 
 14   
class NormalizerPreParser(PreParser):
    """ Calls a named Normalizer to do the conversion """

    def __init__(self, session, config, parent):
        """Resolve the object configured under the 'normalizer' path.

        Raises ConfigFileException when no such object is configured.
        """
        PreParser.__init__(self, session, config, parent)
        self.normalizer = self.get_path(session, 'normalizer', None)
        # Identity test: unlike '== None' it cannot be affected by a custom
        # __eq__ on the resolved object.
        if self.normalizer is None:
            # NOTE(review): ConfigFileException is not among the imports
            # visible at the top of this module -- confirm it is in scope.
            raise ConfigFileException("Normalizer for %s does not exist." % self.id)

    def process_document(self, session, doc):
        """Run the document's raw data through the normalizer.

        Returns a new StringDocument wrapping the normalizer's output.
        """
        data = doc.get_raw()
        new = self.normalizer.process_string(session, data)
        return StringDocument(new)
28 29 30 31 # --- HTML PreParsers --- 32
class HtmlSmashPreParser(PreParser):
    """ Attempts to reduce HTML to its raw text """

    def __init__(self, session, config, parent):
        """Compile the patterns used to dissect an HTML page."""
        PreParser.__init__(self, session, config, parent)
        flags = re.S | re.I
        self.body = re.compile('<body(.*?)</body>', flags)
        self.tagstrip = re.compile('<[^>]+>')
        self.title = re.compile('<title[^>]*>(.+?)</title>', flags)
        self.script = re.compile('<script(.*?)</script>', flags)
        self.style = re.compile('<style(.*?)</style>', flags)
        self.comment = re.compile('<!--(.*?)-->', flags)

    def process_document(self, session, doc):
        """Return a minimal HTML document: the original <title> element
        (if any) plus the tag-free, whitespace-collapsed body text.
        """
        data = doc.get_raw()
        # Scripts, stylesheets and comments go first so that their content
        # never reaches the text extraction below.
        for junk in (self.script, self.style, self.comment):
            data = junk.sub('', data)

        # The whole <title>...</title> element is kept, tags included.
        title = ""
        titleMatch = self.title.search(data)
        if titleMatch:
            title = titleMatch.group(0)

        # Fall back to the entire page when there is no <body> element.
        body = data
        bodyMatch = self.body.search(data)
        if bodyMatch:
            body = bodyMatch.group(0)

        text = self.tagstrip.sub(' ', body)
        # Any angle bracket still present was not part of a tag: escape it.
        text = text.replace('<', '&lt;').replace('>', '&gt;')
        # Handle the properly terminated form before the unterminated one.
        text = text.replace("&nbsp;", ' ').replace("&nbsp", ' ')

        # split() + join() collapses every run of whitespace to one space.
        text = ' '.join(text.split())
        page = "<html><head>%s</head><body>%s</body></html>" % (title, text)
        return StringDocument(page)
69 70
class RegexpSmashPreParser(PreParser):
    """ Either strip, replace or keep data which matches a given regular expression """

    def __init__(self, session, config, parent):
        """Read the 'char', 'regexp' and 'keep' settings."""
        PreParser.__init__(self, session, config, parent)
        char = self.get_setting(session, 'char')
        regex = self.get_setting(session, 'regexp')
        self.keep = self.get_setting(session, 'keep')
        # NOTE(review): self.regexp is left unset when no 'regexp' setting
        # is given, so process_document would then fail on attribute access.
        if regex:
            self.regexp = re.compile(regex, re.S)
        # Replacement / joining string; empty when not configured.
        self.char = char or ''

    def process_document(self, session, doc):
        """Apply the configured expression to the document.

        With 'keep' set, only the matches survive, joined by 'char';
        otherwise every match is replaced by 'char'.
        """
        data = doc.get_raw()
        if not self.keep:
            return StringDocument(self.regexp.sub(self.char, data))

        found = self.regexp.findall(data)
        # findall yields tuples when the pattern has several groups; only
        # the first group of each match is kept in that case.
        if found and found[0] and isinstance(found[0], tuple):
            found = [groups[0] for groups in found]
        return StringDocument(self.char.join(found))
try:
    import tidy

    class HtmlTidyPreParser(PreParser):
        """ Uses TidyLib to turn HTML into XHTML for parsing """

        def process_document(self, session, doc):
            """Return the document re-serialised as XHTML by TidyLib."""
            d = tidy.parseString(doc.get_raw(), output_xhtml=1, add_xml_decl=0, tidy_mark=0, indent=0)
            return StringDocument(str(d))

except Exception:
    # Any failure to load the binding selects the command line fallback.
    # 'Exception' rather than a bare except, so that KeyboardInterrupt and
    # SystemExit raised during import are not swallowed.

    class HtmlTidyPreParser(PreParser):
        """ Calls Tidy utility to turn HTML into XHTML for parsing """

        def __init__(self, session, server, config):
            """Locate the tidy executable: the 'tidy' path setting if
            present, otherwise whatever `which tidy` reports.
            """
            PreParser.__init__(self, session, server, config)
            tidyPath = self.get_path(session, 'tidy')
            if not tidyPath:
                tidyPath = commands.getoutput('which tidy')
            self.cmd = tidyPath + " -asxml"

        def process_document(self, session, doc):
            """Write the document to a temp file, let tidy modify it in
            place (-m), and return the result as a new StringDocument.
            """
            (fd, fn) = tempfile.mkstemp()
            try:
                # mkstemp returns an already open descriptor; wrap it so it
                # is closed instead of leaking one descriptor per document.
                fh = os.fdopen(fd, 'w')
                try:
                    fh.write(doc.get_raw())
                finally:
                    fh.close()
                commands.getoutput(self.cmd + ' -m ' + fn)
                i = open(fn)
                try:
                    data = i.read()
                finally:
                    i.close()
            finally:
                # Remove the temp file even if writing or tidy failed.
                os.remove(fn)

            data = data.replace('<!doctype', '<!DOCTYPE')
            return StringDocument(data)
class TagStripPreParser(PreParser):
    """ Strip only named tags from the document eg script, style"""

    def process_document(self, session, doc):
        """Return a StringDocument with every <script> element removed."""
        # XXX To config
        scriptRe = re.compile('<script(.*?)</script>', re.S)
        stripped = scriptRe.sub('', doc.get_raw())
        return StringDocument(stripped)
142 143 144 145 146 # --- PDF PreParsers --- 147 148
class PdfToXmlPreParser(PreParser):
    """ pdftohtml wrapper to turn PDF into XML """

    def process_document(self, session, doc):
        """Convert a PDF document with `pdftohtml -xml`.

        Returns a StringDocument holding the tool's output with <i>/<b>
        markup removed, and with its 'tag' attribute set to 'page'.
        """
        (fd, fn) = tempfile.mkstemp('.pdf')
        try:
            # Reuse the descriptor mkstemp opened (it used to leak) and
            # write in binary mode, since PDF is binary data.
            fh = os.fdopen(fd, 'wb')
            try:
                fh.write(doc.get_raw())
            finally:
                fh.close()
            cmd = "pdftohtml -xml -stdout %s" % fn
            (i, o, err) = os.popen3(cmd)
            try:
                data = o.read()
            finally:
                # Release all three pipes; they were previously left open.
                i.close()
                o.close()
                err.close()
        finally:
            # Remove the temp file even if writing or conversion failed.
            os.remove(fn)

        replacements = (
            ('<A ', '<a '),
            ('<i>', ''),
            ('</i>', ''),
            ('<b>', ''),
            ('</b>', ''),
        )
        for (old, new) in replacements:
            data = data.replace(old, new)
        doc = StringDocument(data)
        doc.tag = 'page'
        return doc
169 170
class PdfToTxtPreParser(PreParser):
    """ Convert PDF to text via pdftotext utility """

    inMimeType = "application/pdf"
    outMimeType = "text/plain"

    def __init__(self, session, server, parent):
        """Locate pdftotext: the 'pdftotext' path setting if present,
        otherwise whatever `which pdftotext` reports.
        """
        PreParser.__init__(self, session, server, parent)
        # A line consisting only of digits is treated as a page number.
        self.pagenum = re.compile('\n[0-9]+\n')
        p2t = self.get_path(session, "pdftotext")
        if not p2t:
            p2t = commands.getoutput('which pdftotext')
        self.cmd = p2t + " -raw %s %s"

    def process_document(self, session, doc):
        """Convert a PDF document to plain text.

        Returns a 'text/plain' StringDocument; its text is empty when
        pdftotext wrote anything to stderr.
        """
        (infd, intf) = tempfile.mkstemp()
        (outfd, outtf) = tempfile.mkstemp()
        # Only the output *name* is needed (pdftotext opens it itself), so
        # release the descriptor mkstemp opened instead of leaking it.
        os.close(outfd)
        try:
            # Binary mode: PDF is binary data.
            writeh = os.fdopen(infd, 'wb')
            try:
                writeh.write(doc.get_raw())
            finally:
                writeh.close()
            cmd = self.cmd % (intf, outtf)
            (stdin, stdout, stderr) = os.popen3(cmd)
            try:
                error = stderr.read()
            finally:
                stdin.close()
                stdout.close()
                stderr.close()
            if not error:
                inh = open(outtf)
                try:
                    txt = inh.read()
                finally:
                    inh.close()
            else:
                txt = ""
        finally:
            # Remove both temp files even when conversion failed.
            os.remove(outtf)
            os.remove(intf)
        # Strip page numbers.  This used to reference an undefined name
        # ('pagenums'), raising NameError on every call.
        txt = self.pagenum.sub('\n', txt)
        return StringDocument(txt, self.id, doc.processHistory, 'text/plain', doc.parent)
204 205 206 207 # --- Not Quite Xml PreParsers --- 208
class SgmlPreParser(PreParser):
    """ Convert SGML into XML """
    # Entity name -> replacement text, applied to '&name;' references.
    entities = {}
    # Element names that are rewritten as self-closing tags.
    emptyTags = []
    doctype_re = None
    attr_re = None
    elem_re = None
    amp_re = None
    inMimeType = "text/sgml"
    outMimeType = "text/xml"

    def __init__(self, session, server, config):
        """Compile the rewrite patterns and read the 'emptyElements'
        setting (a whitespace separated list of element names).
        """
        PreParser.__init__(self, session, server, config)
        self.doctype_re = (re.compile('<!DOCTYPE (.+?)"(.+?)">'))
        # Unquoted attribute value followed by a space or '>'.
        self.attr_re = re.compile(' ([a-zA-Z0-9_]+)[ ]*=[ ]*([-:_.a-zA-Z0-9]+)([ >])')
        self.pi_re = re.compile("<\?(.*?)\?>")
        self.elem_re = re.compile('(<[/]?)([a-zA-Z0-9_]+)')
        # Ampersand directly followed by whitespace.
        self.amp_re = re.compile('&(\s)')
        taglist = self.get_setting(None, 'emptyElements')
        if taglist:
            self.emptyTags = taglist.split()

    def _loneAmpersand(self, match):
        """Escape the ampersand, keeping the whitespace that followed it."""
        trailing = match.group(1)
        return '&amp;%s' % trailing

    def _lowerElement(self, match):
        """Lower-case an element name, keeping its '<' or '</' prefix."""
        (opener, name) = match.group(1, 2)
        return "%s%s" % (opener, name.lower())

    def _attributeFix(self, match):
        """Lower-case the attribute name and double-quote its value."""
        (name, value, tail) = match.group(1, 2, 3)
        return ' %s="%s"%s' % (name.lower(), value, tail)

    def _emptyElement(self, match):
        """Rewrite a matched tag as a self-closing element."""
        return "<%s/>" % (match.group(1))

    def process_document(self, session, doc):
        """Apply the SGML to XML rewrites and return a 'text/xml'
        StringDocument.
        """
        txt = doc.get_raw()

        # Line breaks become spaces, as do character references 9..13.
        for lineBreak in ('\n', '\r'):
            txt = txt.replace(lineBreak, ' ')
        for code in range(9, 14):
            txt = txt.replace('&#%d;' % (code), ' ')

        txt = self.doctype_re.sub('', txt)
        for (name, value) in self.entities.items():
            txt = txt.replace("&%s;" % (name), value)

        txt = self.amp_re.sub(self._loneAmpersand, txt)
        txt = txt.replace('&<', '&amp;<')
        txt = self.attr_re.sub(self._attributeFix, txt)
        txt = self.elem_re.sub(self._lowerElement, txt)
        for tagName in self.emptyTags:
            emptyRe = re.compile('<(%s( [^>]+)?)[\s/]*>' % tagName)
            txt = emptyRe.sub(self._emptyElement, txt)
        # strip processing instructions.
        txt = self.pi_re.sub('', txt)

        return StringDocument(txt, self.id, doc.processHistory, 'text/xml', doc.parent)
266 267 268 269 # XXX Should this be a NormalizerPreParser?
class AmpPreParser(PreParser):
    """ Escape lone ampersands in otherwise XML text """
    entities = {}

    def __init__(self, session, server, config):
        """Compile the lone-ampersand pattern and reset the entity map."""
        PreParser.__init__(self, session, server, config)
        # '&', then characters that are neither whitespace nor ';', then
        # whitespace or end of input: an '&' that does not start a
        # ';'-terminated entity reference.
        self.amp_re = re.compile(r'&([^\s;]*)(\s|$)')
        self.entities = {}

    def _loneAmpersand(self, match):
        """Escape the ampersand and put back exactly what followed it.

        The matched whitespace (group 2) is re-emitted unchanged.  It used
        to be replaced by a single space, which turned newlines and tabs
        into spaces and appended a stray space at end of input.
        """
        return '&amp;%s%s' % (match.group(1), match.group(2))

    def process_document(self, session, d):
        """Expand the configured entities, escape lone ampersands and
        return a 'text/xml' StringDocument.
        """
        txt = d.get_raw()
        for (name, value) in self.entities.items():
            txt = txt.replace("&%s;" % (name), value)
        txt = self.amp_re.sub(self._loneAmpersand, txt)
        return StringDocument(txt, self.id, d.processHistory, 'text/xml', d.parent)
288 289 290 # --- MARC PreParsers --- 291
class MarcToXmlPreParser(PreParser):
    """ Convert MARC into MARCXML """
    inMimeType = "application/marc"
    outMimeType = "text/xml"

    def process_document(self, session, doc):
        """Parse the raw data as MARC and return its MARCXML form as a
        'text/xml' StringDocument.
        """
        record = MARC(doc.get_raw())
        xml = record.toMARCXML()
        return StringDocument(xml, self.id, doc.processHistory, 'text/xml', doc.parent)
301
class MarcToSgmlPreParser(PreParser):
    """ Convert MARC into Cheshire2's MarcSgml """
    inMimeType = "application/marc"
    outMimeType = "text/sgml"

    def process_document(self, session, doc):
        """Parse the raw data as MARC and return its SGML form as a
        'text/sgml' StringDocument.
        """
        record = MARC(doc.get_raw())
        sgml = record.toSGML()
        return StringDocument(sgml, self.id, doc.processHistory, 'text/sgml', doc.parent)
311 312 313 # --- Raw Text PreParsers --- 314
class TxtToXmlPreParser(PreParser):
    """ Minimally wrap text in &lt;data&gt; xml tags """

    inMimeType = "text/plain"
    outMimeType = "text/xml"

    def process_document(self, session, doc):
        """XML-escape the raw text, wrap it in a single data element and
        return it as a 'text/xml' StringDocument.
        """
        escaped = escape(doc.get_raw())
        wrapped = ''.join(("<data>", escaped, "</data>"))
        return StringDocument(wrapped, self.id, doc.processHistory, 'text/xml', doc.parent)
325 326 327 328 # --- Compression PreParsers --- 329
class GzipPreParser(PreParser):
    """ Gunzip a gzipped document """
    inMimeType = ""
    outMimeType = ""

    def process_document(self, session, doc):
        """Decompress the raw data in memory and return it as a new
        StringDocument with the same parent.
        """
        compressed = StringIO.StringIO(doc.get_raw())
        unzipper = gzip.GzipFile(mode='rb', fileobj=compressed)
        return StringDocument(unzipper.read(), self.id, doc.processHistory, parent=doc.parent)
340
class BzipPreParser(PreParser):
    """ Bunzip2 a bzip2 compressed document """

    def process_document(self, session, doc):
        """Decompress the raw data in memory and return it as a new
        StringDocument with the same parent.
        """
        # bz2.BZ2File only opens files by name and has no 'fileobj'
        # argument, so the previous call raised TypeError.  One-shot
        # decompression of the in-memory data needs no file at all.
        data = bz2.decompress(doc.get_raw())
        # 'parent' is passed by keyword: the fourth positional argument of
        # StringDocument is the mime type, as the other PreParsers in this
        # module show.
        return StringDocument(data, self.id, doc.processHistory, parent=doc.parent)
347
class B64EncodePreParser(PreParser):
    """ Encode document in Base64 """

    def process_document(self, session, doc):
        """Return a new StringDocument holding the Base64 encoding of the
        raw data.
        """
        data = doc.get_raw()
        # b2a_* goes binary -> ASCII, i.e. it encodes.  The a2b_ variant
        # used here before decodes, the opposite of this class's purpose.
        encoded = binascii.b2a_base64(data)
        # 'parent' is passed by keyword: the fourth positional argument of
        # StringDocument is the mime type, as the other PreParsers in this
        # module show.
        return StringDocument(encoded, self.id, doc.processHistory, parent=doc.parent)
354 355
class B64DecodePreParser(PreParser):
    """ Decode document from Base64 """

    def process_document(self, session, doc):
        """Return a new StringDocument holding the data decoded from its
        Base64 representation.
        """
        data = doc.get_raw()
        # a2b_* goes ASCII -> binary, i.e. it decodes.  The b2a_ variant
        # used here before encodes, the opposite of this class's purpose.
        decoded = binascii.a2b_base64(data)
        # 'parent' is passed by keyword: the fourth positional argument of
        # StringDocument is the mime type, as the other PreParsers in this
        # module show.
        return StringDocument(decoded, self.id, doc.processHistory, parent=doc.parent)
362 363 364 365 366 # --- Nasty OpenOffice PreParser --- 367
class UrlPreParser(PreParser):
    """ Base for PreParsers that POST the document to a remote HTTP service """

    def post_multipart(self, host, selector, fields, files):
        """POST a multipart/form-data request and return the response body.

        fields is a sequence of (name, value) pairs; files is a sequence
        of (name, filename, value) triples.
        """
        content_type, body = self.encode_multipart_formdata(fields, files)
        h = httplib.HTTPConnection(host)
        try:
            headers = {'content-type': content_type}
            h.request('POST', selector, body, headers)
            resp = h.getresponse()
            return resp.read()
        finally:
            # Release the socket whether or not the request succeeded.
            h.close()

    def encode_multipart_formdata(self, fields, files):
        """Build a multipart/form-data payload.

        Returns a (content_type, body) pair ready for an HTTP request.
        """
        BOUNDARY = '----------ThIs_Is_tHe_bouNdaRY_$'
        CRLF = '\r\n'
        L = []
        for (key, value) in fields:
            L.append('--' + BOUNDARY)
            L.append('Content-Disposition: form-data; name="%s"' % key)
            L.append('')
            L.append(value)
        for (key, filename, value) in files:
            L.append('--' + BOUNDARY)
            L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filename))
            L.append('Content-Type: %s' % self.get_content_type(filename))
            L.append('')
            L.append(value)
        L.append('--' + BOUNDARY + '--')
        L.append('')
        body = CRLF.join(L)
        content_type = 'multipart/form-data; boundary=%s' % BOUNDARY
        return content_type, body

    def get_content_type(self, filename):
        """Guess a mime type from the file name, defaulting to
        'application/octet-stream'.
        """
        return mimetypes.guess_type(filename)[0] or 'application/octet-stream'

    def send_request(self, session, data=None):
        """POST data as a file upload to the configured 'RemoteURL' path
        and return the response body.
        """
        url = self.get_path(session, 'RemoteURL')
        if (url[:7] == "http://"):
            url = url[7:]
        hlist = url.split('/', 1)
        host = hlist[0]
        if (len(hlist) == 2):
            selector = hlist[1]
        else:
            selector = ""
        # split() consumed the slash between host and path, but an HTTP
        # request target must start with '/'.  Restore it, unless the
        # remainder already begins with one.
        if not selector.startswith('/'):
            selector = '/' + selector
        # XXX Remove dependency
        fields = ()
        files = [("file", "foo.doc", data)]
        return self.post_multipart(host, selector, fields, files)
416
class OpenOfficePreParser(UrlPreParser):
    """ Use OpenOffice server to convert documents into OpenDocument XML """
    inMimeType = ""
    outMimeType = "text/xml"

    def process_document(self, session, doc):
        """Send the raw data to the remote service and return its response
        as a 'text/xml' StringDocument.

        Any failure of the request yields the placeholder '<error/>'.
        """
        data = doc.get_raw()
        try:
            xml = self.send_request(session, data)
        except Exception:
            # Best effort is deliberate, but 'Exception' (not a bare
            # except) lets KeyboardInterrupt and SystemExit propagate.
            xml = "<error/>"
        return StringDocument(xml, self.id, doc.processHistory, 'text/xml', doc.parent)
429 430 431 432 433 434 # XXX Should be like RegexpNormalizer ? 435
class PrintableOnlyPreParser(PreParser):
    """ Replace or Strip non printable characters """

    inMimeType = "text/*"
    outMimeType = "text/plain"

    def __init__(self, session, config, parent):
        """Compile the character classes and read the 'strip' setting."""
        PreParser.__init__(self, session, config, parent)
        # NOTE(review): this range starts at \x7b ('{'), so '{', '|', '}'
        # and '~' are handled like high characters -- confirm whether \x7f
        # was intended.
        self.asciiRe = re.compile('([\x7b-\xff])')
        self.nonxmlRe = re.compile('([\x00-\x08]|[\x0E-\x1F]|[\x0B\x0C\x1F])')
        self.strip = self.get_setting(session, 'strip')

    def process_document(self, session, doc):
        """Strip any non printable characters.

        Known punctuation sequences are replaced by plain equivalents and
        the characters matched by nonxmlRe become spaces.  Characters
        matched by asciiRe are then removed when 'strip' is set, or turned
        into numeric character references otherwise.
        """
        data = doc.get_raw()
        # This is bizarre, but otherwise:
        # UnicodeDecodeError: 'ascii' codec can't decode byte ...
        # Hence two tables, so the needle type matches the data type.
        # Order matters: the three-character sequences must be handled
        # before the single "\x99" replacement.
        if type(data) is unicode:
            substitutions = (
                (u"\xe2\x80\x9c", u'&quot;'),
                (u"\xe2\x80\x9d", u'&quot;'),
                (u"\xe2\x80\x9e", u'&quot;'),
                (u"\xe2\x80\x93", u'-'),
                (u"\xe2\x80\x98", u"'"),
                (u"\xe2\x80\x99", u"'"),
                (u"\xe2\x80\x9a", u","),
                (u"\x99", u"'"),
                (u'\xa0', u' '),
            )
        else:
            substitutions = (
                ("\xe2\x80\x9c", '&quot;'),
                ("\xe2\x80\x9d", '&quot;'),
                ("\xe2\x80\x9e", '&quot;'),
                ("\xe2\x80\x93", '-'),
                ("\xe2\x80\x98", "'"),
                ("\xe2\x80\x99", "'"),
                ("\xe2\x80\x9a", ","),
                ("\x99", "'"),
                ('\xa0', ' '),
            )
        for (needle, plain) in substitutions:
            data = data.replace(needle, plain)

        data = self.nonxmlRe.sub(' ', data)

        if self.strip:
            replacement = ''
        else:
            replacement = lambda x: "&#%s;" % ord(x.group(1))
        return StringDocument(self.asciiRe.sub(replacement, data), self.id, doc.processHistory, doc.mimeType, doc.parent)
482 483 484
class CharacterEntityPreParser(PreParser):
    """ Transform latin-1 and broken character entities into numeric character entities. eg &amp;something; --> &amp;#123; """

    def __init__(self, session, config, parent):
        """Build the entity translation tables and compile the patterns."""
        PreParser.__init__(self, session, config, parent)

        self.numericalEntRe = re.compile(r'&(\d+);')
        self.fractionRe = re.compile(r'&frac(\d)(\d);')
        # Numeric references 0..31.
        self.invalidRe = re.compile(r'&#(\d|[0-2]\d|3[01]);')

        # Code point of the first name in self.entities; each following
        # name is one higher.
        self.start = 160
        # Named entity -> numeric reference (without the '&' and ';').
        # 'rdquo', 'lsquo' and 'rsquo' were each listed twice in this
        # literal; only the later '#34' value ever took effect, so the
        # shadowed entries have been dropped.
        self.otherEntities = {
            "quot": '#34',
            "amp": '#38',
            "lt": '#60',
            "gt": '#62',
            "trade": '#8482',
            "OElig": '#338',
            "oelig": '#339',
            "Scaron": '#352',
            "scaron": '#353',
            "Yuml": '#376',
            "circ": '#710',
            "tilde": '#732',
            "ensp": '#8194',
            "emsp": '#8195',
            "thinsp": '#8201',
            "zwnj": '#8204',
            "zwj": '#8205',
            "lrm": '#8206',
            "rlm": '#8207',
            "ndash": '#8211',
            "mdash": '#8212',
            "sbquo": '#8218',
            "ldquo": '#8220',
            "bdquo": '#8222',
            "dagger": '#8224',
            "Dagger": '#8225',
            "permil": '#8240',
            "lsaquo": '#8249',
            "rsaquo": '#8250',
            "euro": '#8364',
            "rdquo": '#34',
            "lsquo": '#34',
            "rsquo": '#34',
            "half": '#189',
            "ast": '#8727'
        }
        # Entities replaced by literal text.  These are applied first, so
        # 'ldquo' is removed here before otherEntities is consulted.
        self.inane = {
            "apos": "'",
            "hellip": '...',
            "ldquo": '',
            "lsqb": '[',
            "rsqb": ']',
            # &sol; is the solidus '/', not the backslash (that is &bsol;).
            "sol": '/',
            "commat": '@',
            "plus": '+',
            "percnt": '%'
        }

        # Upper-cased misspellings -> correct entity name.  The first key
        # used to read "OUML;", which could only match '&OUML;;'.
        self.preEntities = {"OUML": "Ouml", "UUML": "Uuml", "AELIG": "AElig"}
        # Entity names in code point order, beginning at self.start.
        self.entities = [
            'nbsp', 'iexcl', 'cent', 'pound', 'curren', 'yen', 'brvbar', 'sect',
            'uml', 'copy', 'ordf', 'laquo', 'not', 'shy', 'reg', 'macr',
            'deg', 'plusmn', 'sup2', 'sup3', 'acute', 'micro', 'para', 'middot',
            'cedil', 'sup1', 'ordm', 'raquo', 'frac14', 'frac12', 'frac34', 'iquest',
            'Agrave', 'Aacute', 'Acirc', 'Atilde', 'Auml', 'Aring', 'AElig', 'Ccedil',
            'Egrave', 'Eacute', 'Ecirc', 'Euml', 'Igrave', 'Iacute', 'Icirc', 'Iuml',
            'ETH', 'Ntilde', 'Ograve', 'Oacute', 'Ocirc', 'Otilde', 'Ouml', 'times',
            'Oslash', 'Ugrave', 'Uacute', 'Ucirc', 'Uuml', 'Yacute', 'THORN', 'szlig',
            'agrave', 'aacute', 'acirc', 'atilde', 'auml', 'aring', 'aelig', 'ccedil',
            'egrave', 'eacute', 'ecirc', 'euml', 'igrave', 'iacute', 'icirc', 'iuml',
            'eth', 'ntilde', 'ograve', 'oacute', 'ocirc', 'otilde', 'ouml', 'divide',
            'oslash', 'ugrave', 'uacute', 'ucirc', 'uuml', 'yacute', 'thorn', 'yuml'
        ]

    def process_document(self, session, doc):
        """Rewrite named and malformed entity references as numeric ones.

        Returns a new StringDocument carrying the input's mime type and
        parent.
        """
        txt = doc.get_raw()
        # Fix some common mistakes
        for (fromEnt, toEnt) in self.inane.items():
            txt = txt.replace("&%s;" % fromEnt, toEnt)
        for (fromEnt, toEnt) in self.preEntities.items():
            txt = txt.replace("&%s;" % fromEnt, "&%s;" % toEnt)
        for (offset, name) in enumerate(self.entities):
            txt = txt.replace("&%s;" % name, "&#%s;" % (self.start + offset))
        for (fent, totxt) in self.otherEntities.items():
            txt = txt.replace("&%s;" % fent, "&%s;" % totxt)

        # Add missing # in &123;
        def hashed(mo):
            return '&#%s;' % mo.group(1)
        txt = self.numericalEntRe.sub(hashed, txt)

        # Fraction entities. (?)
        def fraction(mo):
            return '%s&#8260;%s' % (mo.group(1), mo.group(2))
        txt = self.fractionRe.sub(fraction, txt)

        # kill invalid character entities
        txt = self.invalidRe.sub('', txt)

        # The fourth positional argument of StringDocument is the mime
        # type; doc.parent used to be passed in that slot by mistake.
        return StringDocument(txt, self.id, doc.processHistory, doc.mimeType, doc.parent)
574