1    package com.instantbank.collections.util;
2    
3    
/**
 * Hand-written tokenizer for a small XML-like markup.
 * <P>
 * The analyzer walks a document string from left to right. Two scanning modes
 * are offered: {@link #getToken()} recognises markup tokens (angle brackets,
 * names, quoted values, ...) and {@link #getTokenElement()} recognises element
 * content (free text up to the next {@code <}). After a {@code TNAME},
 * {@code TTEXT} or {@code TVALUE} token the matched text is available from
 * {@link #getValue()}.
 * <P>
 * The NUL character (code 0) is used internally as the end-of-document marker,
 * so a literal NUL inside the document is treated as the end of the input.
 *
 * @author Guillermo Posse
 */
public class LexicalAnalyzer {
  // Token codes. They are instance fields (not static) so that the public
  // interface of the class is unchanged.
  public final int TINVALID = -1;
  public final int TEND = 0;
  public final int TLESSTHAN = 1;  // <
  public final int TGREATERTHAN = 2;  // >
  public final int TQUESTION = 3;  // ?
  public final int TSLASH = 4;  // /
  public final int TEQUAL = 5;  // =
  public final int TENDTAG = 6;  // </
  public final int TSTARTHEADER = 7;  // <?
  public final int TNAME = 8;
  public final int TTEXT = 9;
  public final int TVALUE = 10;

  /** Printable form of token codes 1..10; index is {@code code - 1}. */
  private static final String[] TOKEN_STRINGS =
      {"<", ">", "?", "/", "=", "</", "<?", "Name", "Text", "Value"};

  /** Characters allowed as the first character of a name. */
  private static final String LETTERS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
  /** Characters skipped before every token. */
  private static final String SPACES = "\n \t\r";
  /** Characters that terminate a name. */
  private static final String SEPARATORS = SPACES + "=/<>\"'?";

  private int currentPosition;
  private String document;
  private int length = 0;
  private String value;


  /**
   * Creates an analyzer positioned at the start of the given document.
   *
   * @param document the text to tokenize; must not be {@code null}
   * @throws NullPointerException if {@code document} is {@code null}
   */
  public LexicalAnalyzer(String document) {
    this.document = document;
    length = document.length();
  }


  /**
   * Returns the character at the given position, or 0 when the position lies
   * outside the document.
   */
  private char charAt(int position) {
    if(position < 0 || position >= length) {
      return 0;
    }
    return document.charAt(position);
  }


  /** Returns the character at the current position, or 0 at end of document. */
  private char getChar() {
    return charAt(currentPosition);
  }


  /**
   * Scans the next markup token, skipping leading whitespace.
   * <P>
   * For {@code TNAME} and {@code TVALUE} the matched text (without the
   * surrounding quotes for a value) is stored for {@link #getValue()}.
   * An unterminated quoted value yields {@code TEND}. When {@code TINVALID}
   * is returned the position is not advanced past the offending character.
   *
   * @return one of the {@code T...} token codes
   */
  public int getToken() {
    ignoreSpaces();
    char c = getChar();
    if(c == 0) {
      return TEND;
    }
    if(c == '<') {
      // Bounds-checked lookahead: a '<' that ends the document is a plain '<'.
      char next = charAt(currentPosition + 1);
      if(next == '/') {
        currentPosition += 2;
        return TENDTAG;
      }
      if(next == '?') {
        currentPosition += 2;
        return TSTARTHEADER;
      }
      currentPosition++;
      return TLESSTHAN;
    }
    if(c == '>') {
      currentPosition++;
      return TGREATERTHAN;
    }
    if(c == '?') {
      currentPosition++;
      return TQUESTION;
    }
    if(c == '/') {
      currentPosition++;
      return TSLASH;
    }
    if(c == '=') {
      currentPosition++;
      return TEQUAL;
    }
    if(c == '\'' || c == '\"') {
      return getTokenValue(c);
    }
    if(getTokenName(c)) {
      return TNAME;
    }
    return TINVALID;
  }


  /**
   * Scans the next token of element content, skipping leading whitespace.
   * <P>
   * Recognises {@code </} ({@code TENDTAG}), {@code <} ({@code TLESSTHAN})
   * and otherwise free text up to the next {@code <} or the end of the
   * document ({@code TTEXT}, text available from {@link #getValue()}).
   *
   * @return {@code TEND}, {@code TENDTAG}, {@code TLESSTHAN} or {@code TTEXT}
   */
  public int getTokenElement() {
    ignoreSpaces();
    char c = getChar();
    if(c == 0) {
      return TEND;
    }
    if(c == '<') {
      // Bounds-checked lookahead: a '<' that ends the document is a plain '<'.
      if(charAt(currentPosition + 1) == '/') {
        currentPosition += 2;
        return TENDTAG;
      }
      currentPosition++;
      return TLESSTHAN;
    }
    if(getTokenText(c)) {
      return TTEXT;
    }
    return TINVALID;
  }


  /**
   * Reads a quoted value whose opening quote is at the current position.
   * The text between the quotes is stored in {@code value}.
   *
   * @param quote the opening quote character; the same character closes the value
   * @return {@code TVALUE}, or {@code TEND} if the document ends before the
   *         closing quote (the text read so far is still stored)
   */
  private int getTokenValue(char quote) {
    StringBuilder text = new StringBuilder();
    while(true) {
      currentPosition++;
      char c = getChar();
      if(c == 0) {
        value = text.toString();
        return TEND;
      }
      if(c == quote) {
        currentPosition++;
        value = text.toString();
        return TVALUE;
      }
      text.append(c);
    }
  }


  /**
   * Reads a name starting at the current position. A name starts with an
   * ASCII letter and runs up to the next separator or the end of the document.
   *
   * @param par the character at the current position
   * @return {@code true} if a name was read (stored in {@code value});
   *         {@code false} if {@code par} is not a letter, in which case neither
   *         the position nor {@code value} is changed
   */
  private boolean getTokenName(char par) {
    if(LETTERS.indexOf(par) < 0) {
      return false;
    }
    StringBuilder name = new StringBuilder();
    char c = par;
    do {
      name.append(c);
      currentPosition++;
      c = getChar();
      // c == 0 marks end of document; without this check the loop never ends
      // because 0 is not one of the separators.
    } while(c != 0 && SEPARATORS.indexOf(c) < 0);
    value = name.toString();
    return true;
  }


  /**
   * Reads free text from the current position up to (not including) the next
   * {@code <} or the end of the document and stores it in {@code value}.
   *
   * @param par the character at the current position
   * @return always {@code true}
   */
  private boolean getTokenText(char par) {
    StringBuilder text = new StringBuilder();
    char c = par;
    while(c != '<' && c != 0) {
      text.append(c);
      currentPosition++;
      c = getChar();
    }
    value = text.toString();
    return true;
  }


  /**
   * Returns a printable form of a token code.
   *
   * @param t a token code
   * @return the literal symbol for punctuation tokens, {@code "Name"},
   *         {@code "Text"} or {@code "Value"} for the textual tokens,
   *         {@code "End"} for {@code TEND} and {@code "Invalid"} for
   *         {@code TINVALID}
   * @throws ArrayIndexOutOfBoundsException if {@code t} is not a known token code
   */
  public String getTokenString(int t) {
    if(t == TEND) {
      return "End";
    }
    if(t == TINVALID) {
      return "Invalid";
    }
    return TOKEN_STRINGS[t - 1];
  }


  /**
   * Returns the text of the most recently scanned name, text or quoted value.
   *
   * @return the stored text, or {@code null} if none has been scanned yet
   */
  public String getValue() {
    return value;
  }


  /** Advances the position past any run of space, tab, CR or LF characters. */
  private void ignoreSpaces() {
    while(true) {
      char c = getChar();
      if(c == 0 || SPACES.indexOf(c) == -1) {
        break;
      }
      currentPosition++;
    }
  }
}
208  
209