KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > lucene > analysis > de > GermanAnalyzer


1 package org.apache.lucene.analysis.de;
2 // This file is encoded in UTF-8
3

4 /**
5  * Copyright 2004 The Apache Software Foundation
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */

19
20 import org.apache.lucene.analysis.Analyzer;
21 import org.apache.lucene.analysis.LowerCaseFilter;
22 import org.apache.lucene.analysis.StopFilter;
23 import org.apache.lucene.analysis.TokenStream;
24 import org.apache.lucene.analysis.WordlistLoader;
25 import org.apache.lucene.analysis.standard.StandardFilter;
26 import org.apache.lucene.analysis.standard.StandardTokenizer;
27
28 import java.io.File JavaDoc;
29 import java.io.IOException JavaDoc;
30 import java.io.Reader JavaDoc;
31 import java.util.HashSet JavaDoc;
32 import java.util.Hashtable JavaDoc;
33 import java.util.Set JavaDoc;
34
35 /**
36  * Analyzer for German language. Supports an external list of stopwords (words that
37  * will not be indexed at all) and an external list of exclusions (word that will
38  * not be stemmed, but indexed).
39  * A default set of stopwords is used unless an alternative list is specified, the
40  * exclusion list is empty by default.
41  *
42  * @author Gerhard Schwarz
43  * @version $Id: GermanAnalyzer.java 151017 2004-11-29 22:22:48Z dnaber $
44  */

45 public class GermanAnalyzer extends Analyzer {
46   
47   /**
48    * List of typical german stopwords.
49    */

50   public final static String JavaDoc[] GERMAN_STOP_WORDS = {
51     "einer", "eine", "eines", "einem", "einen",
52     "der", "die", "das", "dass", "daß",
53     "du", "er", "sie", "es",
54     "was", "wer", "wie", "wir",
55     "und", "oder", "ohne", "mit",
56     "am", "im", "in", "aus", "auf",
57     "ist", "sein", "war", "wird",
58     "ihr", "ihre", "ihres",
59     "als", "für", "von", "mit",
60     "dich", "dir", "mich", "mir",
61     "mein", "sein", "kein",
62     "durch", "wegen", "wird"
63   };
64
65   /**
66    * Contains the stopwords used with the StopFilter.
67    */

68   private Set JavaDoc stopSet = new HashSet JavaDoc();
69
70   /**
71    * Contains words that should be indexed but not stemmed.
72    */

73   private Set JavaDoc exclusionSet = new HashSet JavaDoc();
74
75   /**
76    * Builds an analyzer with the default stop words
77    * (<code>GERMAN_STOP_WORDS</code>).
78    */

79   public GermanAnalyzer() {
80     stopSet = StopFilter.makeStopSet(GERMAN_STOP_WORDS);
81   }
82
83   /**
84    * Builds an analyzer with the given stop words.
85    */

86   public GermanAnalyzer(String JavaDoc[] stopwords) {
87     stopSet = StopFilter.makeStopSet(stopwords);
88   }
89
90   /**
91    * Builds an analyzer with the given stop words.
92    */

93   public GermanAnalyzer(Hashtable JavaDoc stopwords) {
94     stopSet = new HashSet JavaDoc(stopwords.keySet());
95   }
96
97   /**
98    * Builds an analyzer with the given stop words.
99    */

100   public GermanAnalyzer(File JavaDoc stopwords) throws IOException JavaDoc {
101     stopSet = WordlistLoader.getWordSet(stopwords);
102   }
103
104   /**
105    * Builds an exclusionlist from an array of Strings.
106    */

107   public void setStemExclusionTable(String JavaDoc[] exclusionlist) {
108     exclusionSet = StopFilter.makeStopSet(exclusionlist);
109   }
110
111   /**
112    * Builds an exclusionlist from a Hashtable.
113    */

114   public void setStemExclusionTable(Hashtable JavaDoc exclusionlist) {
115     exclusionSet = new HashSet JavaDoc(exclusionlist.keySet());
116   }
117
118   /**
119    * Builds an exclusionlist from the words contained in the given file.
120    */

121   public void setStemExclusionTable(File JavaDoc exclusionlist) throws IOException JavaDoc {
122     exclusionSet = WordlistLoader.getWordSet(exclusionlist);
123   }
124
125   /**
126    * Creates a TokenStream which tokenizes all the text in the provided Reader.
127    *
128    * @return A TokenStream build from a StandardTokenizer filtered with
129    * StandardFilter, LowerCaseFilter, StopFilter, GermanStemFilter
130    */

131   public TokenStream tokenStream(String JavaDoc fieldName, Reader JavaDoc reader) {
132     TokenStream result = new StandardTokenizer(reader);
133     result = new StandardFilter(result);
134     result = new LowerCaseFilter(result);
135     result = new StopFilter(result, stopSet);
136     result = new GermanStemFilter(result, exclusionSet);
137     return result;
138   }
139 }
140
Popular Tags