1
2
3
4
5
6 package net.sourceforge.pmd.cpd;
7
8 import java.util.List;
9
10 public abstract class AbstractTokenizer implements Tokenizer {
11
12
13
14 protected List<String> stringToken;
15 protected List<String> ignorableCharacter;
16
17 protected List<String> ignorableStmt;
18 protected char oneLineCommentChar = '#';
19
20 private List<String> code;
21 private int lineNumber = 0;
22 private String currentLine;
23
24 protected boolean spanMultipleLinesString = true;
25
26 private boolean downcaseString = true;
27
28 public void tokenize(SourceCode tokens, Tokens tokenEntries) {
29 code = tokens.getCode();
30
31 for ( lineNumber = 0; lineNumber < code.size(); lineNumber++ ) {
32 currentLine = code.get(lineNumber);
33 int loc = 0;
34 while ( loc < currentLine.length() ) {
35 StringBuilder token = new StringBuilder();
36 loc = getTokenFromLine(token,loc);
37 if (token.length() > 0 && !isIgnorableString(token.toString())) {
38 if (downcaseString) {
39 token = new StringBuilder(token.toString().toLowerCase());
40 }
41
42
43
44
45 tokenEntries.add(new TokenEntry(token.toString(),
46 tokens.getFileName(),
47 lineNumber)
48 );
49
50 }
51 }
52 }
53 tokenEntries.add(TokenEntry.getEOF());
54 }
55
56 private int getTokenFromLine(StringBuilder token, int loc) {
57 for (int j = loc; j < currentLine.length(); j++) {
58 char tok = currentLine.charAt(j);
59 if (!Character.isWhitespace(tok) && !ignoreCharacter(tok)) {
60 if (isComment(tok)) {
61 if (token.length() > 0) {
62 return j;
63 } else {
64 return getCommentToken(token, loc);
65 }
66 } else if (isString(tok)) {
67 if (token.length() > 0) {
68 return j;
69 } else {
70
71 return parseString(token, j, tok);
72 }
73 } else {
74 token.append(tok);
75 }
76 } else {
77 if (token.length() > 0) {
78 return j;
79 }
80 }
81 loc = j;
82 }
83 return loc + 1;
84 }
85
86 private int parseString(StringBuilder token, int loc, char stringDelimiter) {
87 boolean escaped = false;
88 boolean done = false;
89 char tok = ' ';
90 while ((loc < currentLine.length()) && ! done) {
91 tok = currentLine.charAt(loc);
92 if (escaped && tok == stringDelimiter) {
93 escaped = false;
94 } else if (tok == stringDelimiter && (token.length() > 0)) {
95 done = true;
96 } else if (tok == '\\') {
97 escaped = true;
98 } else {
99 escaped = false;
100 }
101
102 token.append(tok);
103 loc++;
104 }
105
106 if ( ! done &&
107 loc >= currentLine.length() &&
108 spanMultipleLinesString &&
109 lineNumber < code.size() - 1
110 ) {
111
112 currentLine = code.get(++lineNumber);
113
114 loc = parseString(token, loc, stringDelimiter);
115 }
116 return loc + 1;
117 }
118
119 private boolean ignoreCharacter(char tok)
120 {
121 return ignorableCharacter.contains(String.valueOf(tok));
122 }
123
124 private boolean isString(char tok)
125 {
126 return stringToken.contains(String.valueOf(tok));
127 }
128
129 private boolean isComment(char tok)
130 {
131 return tok == oneLineCommentChar;
132 }
133
134 private int getCommentToken(StringBuilder token, int loc)
135 {
136 while (loc < currentLine.length())
137 {
138 token.append(currentLine.charAt(loc++));
139 }
140 return loc;
141 }
142
143 private boolean isIgnorableString(String token)
144 {
145 return ignorableStmt.contains(token);
146 }
147 }