Package textmining :: Module tmPreParser
[hide private]
[frames] | no frames]

Source Code for Module textmining.tmPreParser

  1   
  2  from document import StringDocument 
  3  from baseObjects import PreParser 
  4  import os, re 
  5  from utils import getFirstData, elementType, verifyXPaths 
  6   
  7  from TsujiiC3 import TsujiiObject, EnjuObject, GeniaObject 
  8   
class PosPreParser(PreParser):
    """Common ancestor for the Part of Speech PreParsers.

    Adds nothing to PreParser itself; the concrete taggers derive from it.
    """
12
class TsujiiChunkerPreParser(PreParser):
    """PreParser that pipes a document, line by line, through an external chunker.

    The chunker is the ``./parser`` executable, started once per instance in
    the directory named by the 'chunkerPath' path setting, or in
    ``../../code/tsujii`` (relative to the current working directory) when
    that setting is empty.
    """
    # Any need for this outside of preParsing??

    inh = None   # file object writing to the chunker's stdin
    outh = None  # file object reading from the chunker's stdout

    def __init__(self, session, node, parent):
        """Initialise the PreParser and start the chunker child process.

        Raises OSError if the chunker directory cannot be entered.
        """
        # Function-scope import: the module header only imports os and re.
        import subprocess

        PreParser.__init__(self, session, node, parent)
        chunker_dir = self.get_path(session, 'chunkerPath') or '../../code/tsujii'
        # Start the child with cwd= instead of os.chdir()-ing this process
        # there and back: the working directory of the host process is never
        # touched, so a failure here cannot leave it pointing elsewhere.
        # shell=True mirrors how os.popen2 (removed in Python 3) ran the
        # command.
        self._proc = subprocess.Popen(
            './parser',
            shell=True,
            cwd=chunker_dir,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            universal_newlines=True,
        )
        self.inh = self._proc.stdin
        self.outh = self._proc.stdout

    def process_document(self, session, doc):
        """Send each line of the document to the chunker and collect its replies.

        The document must be raw text that has already been passed through
        the tagger.  One line is read back for every line written.

        Returns a new StringDocument holding the collected output lines
        joined with newlines.
        """
        chunked = []
        for line in doc.get_raw().split('\n'):
            self.inh.write(line)
            self.inh.write("\n")
            # Flush so the child sees the line before we block on its reply.
            self.inh.flush()
            # NOTE(review): readline() keeps the line ending and the join
            # below adds another "\n"; left as-is to preserve the output.
            chunked.append(self.outh.readline())
        return StringDocument('\n'.join(chunked))
45 46
47 -class TsujiiXMLPosPreParser(PosPreParser, TsujiiObject):
48
49 - def __init__(self, session, node, parent):
52
def process_document(self, session, doc):
    """Tag the document's raw text in XML mode and wrap it in <text>.

    Calls self.tag() with xml=1, joins the returned pieces with newlines
    and returns a 'text/xml' StringDocument that carries over the source
    document's processHistory and parent.
    """
    tagged = self.tag(session, doc.get_raw(), xml=1)
    wrapped = "<text>" + '\n'.join(tagged) + "</text>"
    return StringDocument(wrapped, self.id, doc.processHistory,
                          'text/xml', doc.parent)
59
60 -class TsujiiTextPosPreParser(PosPreParser, TsujiiObject):
61
62 - def __init__(self, session, node, parent):
65
def process_document(self, session, doc):
    """Tag the document's raw text in plain-text mode.

    Calls self.tag() with xml=0, joins the returned pieces with newlines
    and returns a 'text/plain' StringDocument that carries over the source
    document's processHistory and parent.
    """
    tagged = self.tag(session, doc.get_raw(), xml=0)
    return StringDocument('\n'.join(tagged), self.id, doc.processHistory,
                          'text/plain', doc.parent)
71
class EnjuTextPreParser(PosPreParser, EnjuObject):
    """PreParser that tags text via the inherited tag() method (presumably
    supplied by EnjuObject) and wraps the result in a <text> element."""

    def __init__(self, session, node, parent):
        """Run the initialiser of each base class explicitly, in order."""
        for base in (PosPreParser, EnjuObject):
            base.__init__(self, session, node, parent)

    def process_document(self, session, doc):
        """Return a StringDocument of the tagged raw text inside <text>...</text>."""
        raw = doc.get_raw()
        tagged_lines = self.tag(session, raw)
        joined = '\n'.join(tagged_lines)
        return StringDocument("<text>%s</text>" % joined)
82 83
class GeniaTextPreParser(PreParser):
    """ Take the full output from Genia and reconstruct the document, maybe with stems ('useStem') and/or PoS tags ('pos') """

    def __init__(self, session, config, parent):
        """Read the 'useStem' and 'pos' settings and compile the punctuation regex."""
        PreParser.__init__(self, session, config, parent)
        # NOTE(review): both settings are only tested for truthiness below;
        # if get_setting returns strings, "0" would count as enabled -- confirm.
        self.stem = self.get_setting(session, 'useStem', 0)
        self.pos = self.get_setting(session, 'pos', 0)
        # Matches the space in front of a punctuation mark that is itself
        # followed by a space or newline, so the mark can be re-attached.
        self.puncre = re.compile('[ ]([.,;:?!][ \n])')

    def process_document(self, session, doc):
        """Rebuild running text from tab-separated tagger output.

        Each non-blank line of the raw document is split on tabs into
        word, stem and PoS tag; any further columns are ignored.  The
        stem replaces the word when self.stem is set, and "/<pos>" is
        appended when self.pos is set.  A blank line contributes a newline
        token.  Tokens are joined with single spaces and the space before
        punctuation is then removed.

        Returns a new StringDocument containing the rebuilt text.
        Raises ValueError if a non-blank line has fewer than three columns.
        """
        lines = doc.get_raw().split('\n')
        # Text ending in "\n" yields a trailing empty piece that is not a line.
        if lines and lines[-1] == '':
            lines.pop()

        words = []
        for line in lines:
            if not line:
                # Blank line in the tagger output: keep it as a line break.
                words.append('\n')
                continue
            # split('\n') already removed the newline, so the line must not
            # be truncated before splitting into columns.
            (word, stem, pos) = line.split('\t')[:3]
            if self.stem:
                token = stem
            else:
                token = word
            if self.pos:
                token = "%s/%s" % (token, pos)
            words.append(token)

        txt = ' '.join(words)
        txt = self.puncre.sub('\\1', txt)
        return StringDocument(txt)
112