KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > lucene > demo > IndexHTML


1 package org.apache.lucene.demo;
2
3 /**
4  * Copyright 2004 The Apache Software Foundation
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */

18
19 import org.apache.lucene.analysis.standard.StandardAnalyzer;
20 import org.apache.lucene.document.Document;
21 import org.apache.lucene.index.IndexReader;
22 import org.apache.lucene.index.IndexWriter;
23 import org.apache.lucene.index.Term;
24 import org.apache.lucene.index.TermEnum;
25 import java.io.File JavaDoc;
26 import java.util.Date JavaDoc;
27 import java.util.Arrays JavaDoc;
28
29 class IndexHTML {
30   private static boolean deleting = false; // true during deletion pass
31
private static IndexReader reader; // existing index
32
private static IndexWriter writer; // new index being built
33
private static TermEnum uidIter; // document id iterator
34

35   public static void main(String JavaDoc[] argv) {
36     try {
37       String JavaDoc index = "index";
38       boolean create = false;
39       File JavaDoc root = null;
40
41       String JavaDoc usage = "IndexHTML [-create] [-index <index>] <root_directory>";
42
43       if (argv.length == 0) {
44     System.err.println("Usage: " + usage);
45     return;
46       }
47
48       for (int i = 0; i < argv.length; i++) {
49     if (argv[i].equals("-index")) { // parse -index option
50
index = argv[++i];
51     } else if (argv[i].equals("-create")) { // parse -create option
52
create = true;
53     } else if (i != argv.length-1) {
54       System.err.println("Usage: " + usage);
55       return;
56     } else
57       root = new File JavaDoc(argv[i]);
58       }
59
60       Date JavaDoc start = new Date JavaDoc();
61
62       if (!create) { // delete stale docs
63
deleting = true;
64     indexDocs(root, index, create);
65       }
66
67       writer = new IndexWriter(index, new StandardAnalyzer(), create);
68       writer.maxFieldLength = 1000000;
69
70       indexDocs(root, index, create); // add new docs
71

72       System.out.println("Optimizing index...");
73       writer.optimize();
74       writer.close();
75
76       Date JavaDoc end = new Date JavaDoc();
77
78       System.out.print(end.getTime() - start.getTime());
79       System.out.println(" total milliseconds");
80
81     } catch (Exception JavaDoc e) {
82       System.out.println(" caught a " + e.getClass() +
83              "\n with message: " + e.getMessage());
84     }
85   }
86
87   /* Walk directory hierarchy in uid order, while keeping uid iterator from
88   /* existing index in sync. Mismatches indicate one of: (a) old documents to
89   /* be deleted; (b) unchanged documents, to be left alone; or (c) new
90   /* documents, to be indexed.
91    */

92
93   private static void indexDocs(File JavaDoc file, String JavaDoc index, boolean create)
94        throws Exception JavaDoc {
95     if (!create) { // incrementally update
96

97       reader = IndexReader.open(index); // open existing index
98
uidIter = reader.terms(new Term("uid", "")); // init uid iterator
99

100       indexDocs(file);
101
102       if (deleting) { // delete rest of stale docs
103
while (uidIter.term() != null && uidIter.term().field() == "uid") {
104       System.out.println("deleting " +
105                  HTMLDocument.uid2url(uidIter.term().text()));
106       reader.delete(uidIter.term());
107       uidIter.next();
108     }
109     deleting = false;
110       }
111
112       uidIter.close(); // close uid iterator
113
reader.close(); // close existing index
114

115     } else // don't have exisiting
116
indexDocs(file);
117   }
118
119   private static void indexDocs(File JavaDoc file) throws Exception JavaDoc {
120     if (file.isDirectory()) { // if a directory
121
String JavaDoc[] files = file.list(); // list its files
122
Arrays.sort(files); // sort the files
123
for (int i = 0; i < files.length; i++) // recursively index them
124
indexDocs(new File JavaDoc(file, files[i]));
125
126     } else if (file.getPath().endsWith(".html") || // index .html files
127
file.getPath().endsWith(".htm") || // index .htm files
128
file.getPath().endsWith(".txt")) { // index .txt files
129

130       if (uidIter != null) {
131     String JavaDoc uid = HTMLDocument.uid(file); // construct uid for doc
132

133     while (uidIter.term() != null && uidIter.term().field() == "uid" &&
134            uidIter.term().text().compareTo(uid) < 0) {
135       if (deleting) { // delete stale docs
136
System.out.println("deleting " +
137                    HTMLDocument.uid2url(uidIter.term().text()));
138         reader.delete(uidIter.term());
139       }
140       uidIter.next();
141     }
142     if (uidIter.term() != null && uidIter.term().field() == "uid" &&
143         uidIter.term().text().compareTo(uid) == 0) {
144       uidIter.next(); // keep matching docs
145
} else if (!deleting) { // add new docs
146
Document doc = HTMLDocument.Document(file);
147       System.out.println("adding " + doc.get("url"));
148     writer.addDocument(doc);
149     }
150       } else { // creating a new index
151
Document doc = HTMLDocument.Document(file);
152     System.out.println("adding " + doc.get("url"));
153     writer.addDocument(doc); // add docs unconditionally
154
}
155     }
156   }
157 }
158
Popular Tags