KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > lucene > analysis > ru > RussianAnalyzer


package org.apache.lucene.analysis.ru;

/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;

import java.io.Reader;
import java.util.Hashtable;
import java.util.Set;
import java.util.HashSet;
28 /**
29  * Analyzer for Russian language. Supports an external list of stopwords (words that
30  * will not be indexed at all).
31  * A default set of stopwords is used unless an alternative list is specified.
32  *
33  * @author Boris Okner, b.okner@rogers.com
34  * @version $Id: RussianAnalyzer.java 150998 2004-08-16 20:30:46Z dnaber $
35  */

36 public final class RussianAnalyzer extends Analyzer
37 {
38     // letters (currently unused letters are commented out)
39
private final static char A = 0;
40     private final static char B = 1;
41     private final static char V = 2;
42     private final static char G = 3;
43     private final static char D = 4;
44     private final static char E = 5;
45     private final static char ZH = 6;
46     private final static char Z = 7;
47     private final static char I = 8;
48     private final static char I_ = 9;
49     private final static char K = 10;
50     private final static char L = 11;
51     private final static char M = 12;
52     private final static char N = 13;
53     private final static char O = 14;
54     private final static char P = 15;
55     private final static char R = 16;
56     private final static char S = 17;
57     private final static char T = 18;
58     private final static char U = 19;
59     //private final static char F = 20;
60
private final static char X = 21;
61     //private final static char TS = 22;
62
private final static char CH = 23;
63     private final static char SH = 24;
64     private final static char SHCH = 25;
65     //private final static char HARD = 26;
66
private final static char Y = 27;
67     private final static char SOFT = 28;
68     private final static char AE = 29;
69     private final static char IU = 30;
70     private final static char IA = 31;
71
72     /**
73      * List of typical Russian stopwords.
74      */

75     private static char[][] RUSSIAN_STOP_WORDS = {
76         {A},
77         {B, E, Z},
78         {B, O, L, E, E},
79         {B, Y},
80         {B, Y, L},
81         {B, Y, L, A},
82         {B, Y, L, I},
83         {B, Y, L, O},
84         {B, Y, T, SOFT},
85         {V},
86         {V, A, M},
87         {V, A, S},
88         {V, E, S, SOFT},
89         {V, O},
90         {V, O, T},
91         {V, S, E},
92         {V, S, E, G, O},
93         {V, S, E, X},
94         {V, Y},
95         {G, D, E},
96         {D, A},
97         {D, A, ZH, E},
98         {D, L, IA},
99         {D, O},
100         {E, G, O},
101         {E, E},
102         {E, I_,},
103         {E, IU},
104         {E, S, L, I},
105         {E, S, T, SOFT},
106         {E, SHCH, E},
107         {ZH, E},
108         {Z, A},
109         {Z, D, E, S, SOFT},
110         {I},
111         {I, Z},
112         {I, L, I},
113         {I, M},
114         {I, X},
115         {K},
116         {K, A, K},
117         {K, O},
118         {K, O, G, D, A},
119         {K, T, O},
120         {L, I},
121         {L, I, B, O},
122         {M, N, E},
123         {M, O, ZH, E, T},
124         {M, Y},
125         {N, A},
126         {N, A, D, O},
127         {N, A, SH},
128         {N, E},
129         {N, E, G, O},
130         {N, E, E},
131         {N, E, T},
132         {N, I},
133         {N, I, X},
134         {N, O},
135         {N, U},
136         {O},
137         {O, B},
138         {O, D, N, A, K, O},
139         {O, N},
140         {O, N, A},
141         {O, N, I},
142         {O, N, O},
143         {O, T},
144         {O, CH, E, N, SOFT},
145         {P, O},
146         {P, O, D},
147         {P, R, I},
148         {S},
149         {S, O},
150         {T, A, K},
151         {T, A, K, ZH, E},
152         {T, A, K, O, I_},
153         {T, A, M},
154         {T, E},
155         {T, E, M},
156         {T, O},
157         {T, O, G, O},
158         {T, O, ZH, E},
159         {T, O, I_},
160         {T, O, L, SOFT, K, O},
161         {T, O, M},
162         {T, Y},
163         {U},
164         {U, ZH, E},
165         {X, O, T, IA},
166         {CH, E, G, O},
167         {CH, E, I_},
168         {CH, E, M},
169         {CH, T, O},
170         {CH, T, O, B, Y},
171         {CH, SOFT, E},
172         {CH, SOFT, IA},
173         {AE, T, A},
174         {AE, T, I},
175         {AE, T, O},
176         {IA}
177     };
178
179     /**
180      * Contains the stopwords used with the StopFilter.
181      */

182     private Set JavaDoc stopSet = new HashSet JavaDoc();
183
184     /**
185      * Charset for Russian letters.
186      * Represents encoding for 32 lowercase Russian letters.
187      * Predefined charsets can be taken from RussianCharSets class
188      */

189     private char[] charset;
190
191
192     public RussianAnalyzer() {
193         charset = RussianCharsets.UnicodeRussian;
194         stopSet = StopFilter.makeStopSet(
195                     makeStopWords(RussianCharsets.UnicodeRussian));
196     }
197
198     /**
199      * Builds an analyzer.
200      */

201     public RussianAnalyzer(char[] charset)
202     {
203         this.charset = charset;
204         stopSet = StopFilter.makeStopSet(makeStopWords(charset));
205     }
206
207     /**
208      * Builds an analyzer with the given stop words.
209      */

210     public RussianAnalyzer(char[] charset, String JavaDoc[] stopwords)
211     {
212         this.charset = charset;
213         stopSet = StopFilter.makeStopSet(stopwords);
214     }
215
216     // Takes russian stop words and translates them to a String array, using
217
// the given charset
218
private static String JavaDoc[] makeStopWords(char[] charset)
219     {
220         String JavaDoc[] res = new String JavaDoc[RUSSIAN_STOP_WORDS.length];
221         for (int i = 0; i < res.length; i++)
222         {
223             char[] theStopWord = RUSSIAN_STOP_WORDS[i];
224             // translate the word, using the charset
225
StringBuffer JavaDoc theWord = new StringBuffer JavaDoc();
226             for (int j = 0; j < theStopWord.length; j++)
227             {
228                 theWord.append(charset[theStopWord[j]]);
229             }
230             res[i] = theWord.toString();
231         }
232         return res;
233     }
234
235     /**
236      * Builds an analyzer with the given stop words.
237      * @todo create a Set version of this ctor
238      */

239     public RussianAnalyzer(char[] charset, Hashtable JavaDoc stopwords)
240     {
241         this.charset = charset;
242         stopSet = new HashSet JavaDoc(stopwords.keySet());
243     }
244
245     /**
246      * Creates a TokenStream which tokenizes all the text in the provided Reader.
247      *
248      * @return A TokenStream build from a RussianLetterTokenizer filtered with
249      * RussianLowerCaseFilter, StopFilter, and RussianStemFilter
250      */

251     public TokenStream tokenStream(String JavaDoc fieldName, Reader JavaDoc reader)
252     {
253         TokenStream result = new RussianLetterTokenizer(reader, charset);
254         result = new RussianLowerCaseFilter(result, charset);
255         result = new StopFilter(result, stopSet);
256         result = new RussianStemFilter(result, charset);
257         return result;
258     }
259 }
260
Popular Tags