4 """simple parser / string tokenizer
5 rather than returning a list of token types etc, we simple return a list of tokens...
6 each tokenizing function takes a string as input and returns a list of tokens
7 """
28 """takes away repeated quotes (escapes) and returns the string represented by the text"""
29 stringchar = text[0]
30 if text[-1] != stringchar or stringchar not in ("'", '"'):
31
32 raise ValueError, "error parsing escaped string: %r" % text
33 return text[1:-1].replace(stringchar+stringchar, stringchar)
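# illustrative usage, not from the original module: doubled quote characters act as
# escapes, so stringeval("'don''t'") == "don't" and stringeval('"say ""hi"""') == 'say "hi"'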


def stringquote(text):
    """escapes quotes as necessary and returns a string representing the text"""
    if "'" in text:
        if '"' in text:
            return '"' + text.replace('"', '""') + '"'
        else:
            return '"' + text + '"'
    else:
        return "'" + text + "'"


class ParserError(ValueError):
    """Intelligent parser error"""

    def __init__(self, parser, message, tokennum):
        """takes a message and the number of the token that caused the error"""
        tokenpos = parser.findtokenpos(tokennum)
        line, charpos = parser.getlinepos(tokenpos)
        ValueError.__init__(self, "%s at line %d, char %d (token %r)"
                            % (message, line, charpos, parser.tokens[tokennum]))
        self.parser = parser
        self.tokennum = tokennum


class SimpleParser:
    """this is a simple parser"""

    def __init__(self, defaulttokenlist=None, whitespacechars=" \t\r\n", includewhitespacetokens=0):
        if defaulttokenlist is None:
            # multi-character operators first, then single-character punctuation
            self.defaulttokenlist = ['<=', '>=', '==', '!=', '+=', '-=', '*=', '/=', '<>']
            self.defaulttokenlist.extend('(),[]:=+-')
        else:
            self.defaulttokenlist = defaulttokenlist
        self.whitespacechars = whitespacechars
        self.includewhitespacetokens = includewhitespacetokens
        self.standardtokenizers = [self.stringtokenize, self.removewhitespace, self.separatetokens]
        self.quotechars = ('"', "'")
        self.endquotechars = {'"': '"', "'": "'"}
        self.stringescaping = 1

    def stringtokenize(self, text):
        """makes strings in text into tokens..."""
        tokens = []
        laststart = 0
        instring = 0
        endstringchar, escapechar = '', '\\'
        gotclose, gotescape = 0, 0
        for pos in range(len(text)):
            char = text[pos]
            if instring:
                if self.stringescaping and (gotescape or char == escapechar) and not gotclose:
                    gotescape = not gotescape
                elif char == endstringchar:
                    # a second end quote directly after the first is a doubled-quote escape
                    gotclose = not gotclose
                elif gotclose:
                    # the string closed at the previous character; emit it as one token
                    tokens.append(text[laststart:pos])
                    instring, laststart, endstringchar = 0, pos, ''
            if not instring:
                if char in self.quotechars:
                    if pos > laststart:
                        tokens.append(text[laststart:pos])
                    instring, laststart, endstringchar, gotclose = 1, pos, self.endquotechars[char], 0
        if laststart < len(text):
            tokens.append(text[laststart:])
        return tokens
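    # illustrative example, not from the original module:
    #   SimpleParser().stringtokenize("name = 'it''s'") == ["name = ", "'it''s'"]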

    def keeptogether(self, text):
        """checks whether a token should be kept together"""
        return self.isstringtoken(text)

    def isstringtoken(self, text):
        """checks whether a token is a string token"""
        return text[:1] in self.quotechars

    def separatetokens(self, text, tokenlist=None):
        """this separates out tokens in tokenlist from whitespace etc"""
        if self.keeptogether(text):
            return [text]
        if tokenlist is None:
            tokenlist = self.defaulttokenlist
        tokens = []
        pos = 0
        laststart = 0
        lentext = len(text)
        while pos < lentext:
            foundtoken = 0
            for token in tokenlist:
                lentoken = len(token)
                if text[pos:pos + lentoken] == token:
                    # emit any pending text before the operator, then the operator itself
                    if laststart < pos:
                        tokens.append(text[laststart:pos])
                    tokens.append(token)
                    pos += lentoken
                    foundtoken, laststart = 1, pos
                    break
            if not foundtoken:
                pos += 1
        if laststart < lentext:
            tokens.append(text[laststart:])
        return tokens
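    # illustrative example, not from the original module: with the default token list,
    #   SimpleParser().separatetokens("total+=(a,b)") == ["total", "+=", "(", "a", ",", "b", ")"]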

    def removewhitespace(self, text):
        """this removes whitespace but lets it separate things out into separate tokens"""
        if self.keeptogether(text):
            return [text]
        tokens = []
        inwhitespace = 0
        laststart = 0
        for pos in range(len(text)):
            char = text[pos]
            if inwhitespace:
                if char not in self.whitespacechars:
                    if laststart < pos and self.includewhitespacetokens:
                        tokens.append(text[laststart:pos])
                    inwhitespace, laststart = 0, pos
            else:
                if char in self.whitespacechars:
                    if laststart < pos:
                        tokens.append(text[laststart:pos])
                    inwhitespace, laststart = 1, pos
        if laststart < len(text) and (not inwhitespace or self.includewhitespacetokens):
            tokens.append(text[laststart:])
        return tokens
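    # illustrative example, not from the original module:
    #   SimpleParser().removewhitespace("  a b  ") == ["a", "b"]
    # (with includewhitespacetokens set, the whitespace runs are returned as tokens too)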

    def applytokenizer(self, inputlist, tokenizer):
        """apply a tokenizer to a set of text, flattening the result"""
        # a plain loop rather than map(), so the extends actually run on Python 3
        joined = []
        for text in inputlist:
            joined.extend(tokenizer(text))
        return joined

    def applytokenizers(self, inputlist, tokenizers):
        """apply a set of tokenizers to a set of text, flattening each time"""
        for tokenizer in tokenizers:
            inputlist = self.applytokenizer(inputlist, tokenizer)
        return inputlist

    def tokenize(self, source, tokenizers=None):
        """tokenize the text string with the standard tokenizers"""
        self.source = source
        if tokenizers is None:
            tokenizers = self.standardtokenizers
        self.tokens = self.applytokenizers([self.source], tokenizers)
        return self.tokens
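    # illustrative example, not from the original module: strings are split out first,
    # whitespace dropped, then punctuation separated, so
    #   SimpleParser().tokenize("print(max(a, 'b c'))")
    # returns ["print", "(", "max", "(", "a", ",", "'b c'", ")", ")"]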

    def findtokenpos(self, tokennum):
        """finds the position of the given token in the text"""
        currenttokenpos = 0
        searchpos = 0
        for currenttokennum in range(tokennum + 1):
            currenttokenpos = self.source.find(self.tokens[currenttokennum], searchpos)
            # resume searching after this token so repeated tokens are not refound
            searchpos = currenttokenpos + len(self.tokens[currenttokennum])
        return currenttokenpos

    def getlinepos(self, tokenpos):
        """finds the line and character position of the given character"""
        sourcecut = self.source[:tokenpos]
        line = sourcecut.count("\n") + 1
        charpos = tokenpos - sourcecut.rfind("\n")
        return line, charpos

    def raiseerror(self, message, tokennum):
        """raises a ParserError"""
        raise ParserError(self, message, tokennum)
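

# a minimal smoke test; the sample source line is illustrative, not part of the module
if __name__ == "__main__":
    parser = SimpleParser()
    print(parser.tokenize("result = max(a, 'b c')"))
    # error reporting maps a token number back to a line and column of the
    # original source via findtokenpos and getlinepos
    try:
        parser.raiseerror("unexpected token", 2)
    except ParserError as error:
        print(error)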