package org.creativecommons.nutch;

import net.nutch.io.*;
import net.nutch.util.LogFormatter;
import net.nutch.indexer.IndexSegment;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.document.Document;

import java.io.*;
import java.util.Vector;
import java.util.logging.Logger;

/**
 * Command-line tool that deletes, from a set of Lucene indexes, every
 * document that has no value stored for {@link CCIndexingFilter#FIELD}.
 *
 * <p>NOTE(review): judging by the class name, that field presumably holds the
 * Creative Commons license information; confirm against
 * {@code CCIndexingFilter}.
 */
public class CCDeleteUnlicensedTool {
  private static final Logger LOG =
    LogFormatter.getLogger("org.creativecommons.nutch.CCDeleteUnlicensedTool");

  /** The index readers this tool operates on; closed by {@link #close()}. */
  private final IndexReader[] readers;

  /**
   * Creates a tool operating on the given readers.
   *
   * @param readers the open index readers to scan; the array is used as-is
   *                (not copied) and the readers are closed by {@link #close()}
   */
  public CCDeleteUnlicensedTool(IndexReader[] readers) {
    this.readers = readers;
  }

  /**
   * Closes every reader handed to the constructor.
   *
   * <p>All readers are attempted even if one of them fails to close, so a
   * single failure does not leave the remaining readers open.
   *
   * @throws IOException the first failure encountered while closing; any
   *                     later failures are logged as warnings
   */
  public void close() throws IOException {
    IOException firstFailure = null;
    for (int i = 0; i < readers.length; i++) {
      try {
        readers[i].close();
      } catch (IOException e) {
        if (firstFailure == null) {
          firstFailure = e;
        } else {
          // Only one exception can be rethrown; do not lose the others silently.
          LOG.warning("CC: additional failure while closing reader: " + e);
        }
      }
    }
    if (firstFailure != null) {
      throw firstFailure;
    }
  }

  /**
   * Scans every reader and deletes each not-yet-deleted document for which
   * {@code document.get(CCIndexingFilter.FIELD)} returns {@code null}.
   *
   * @return the number of documents deleted by this call
   * @throws IOException if reading or deleting a document fails
   */
  public int deleteUnlicensed() throws IOException {
    int deleteCount = 0;
    for (int index = 0; index < readers.length; index++) {
      IndexReader reader = readers[index];
      int readerMax = reader.maxDoc();
      for (int doc = 0; doc < readerMax; doc++) {
        // Slots that are already deleted are skipped and not counted.
        if (reader.isDeleted(doc)) {
          continue;
        }
        Document document = reader.document(doc);
        if (document.get(CCIndexingFilter.FIELD) == null) {
          reader.delete(doc);
          deleteCount++;
        }
      }
    }
    return deleteCount;
  }

  /**
   * Opens the {@code index} sub-directory of every given directory that
   * contains a regular file named {@link IndexSegment#DONE_NAME}.
   *
   * <p>NOTE(review): that file is presumably a marker written once a segment
   * has been fully indexed; confirm against {@code IndexSegment}.
   *
   * <p>If any index fails to open, the readers opened so far are closed
   * before the exception propagates.
   */
  private static IndexReader[] openIndexedSegments(File[] directories)
    throws IOException {
    Vector opened = new Vector();
    boolean success = false;
    try {
      for (int i = 0; i < directories.length; i++) {
        File indexDone = new File(directories[i], IndexSegment.DONE_NAME);
        // isFile() is true only for an existing regular file.
        if (indexDone.isFile()) {
          File indexDir = new File(directories[i], "index");
          opened.add(IndexReader.open(indexDir));
        }
      }
      IndexReader[] result =
        (IndexReader[]) opened.toArray(new IndexReader[opened.size()]);
      success = true;
      return result;
    } finally {
      if (!success) {
        for (int i = 0; i < opened.size(); i++) {
          try {
            ((IndexReader) opened.get(i)).close();
          } catch (IOException e) {
            // The original failure is already propagating; just record this one.
            LOG.warning("CC: failure while closing reader after error: " + e);
          }
        }
      }
    }
  }

  /**
   * Entry point. Expects exactly one argument, the segments directory, opens
   * the index of every qualifying sub-directory, deletes the documents
   * selected by {@link #deleteUnlicensed()}, and logs how many were deleted
   * out of the summed {@code maxDoc()} of all opened readers.
   *
   * @param args {@code args[0]} is the segments directory
   * @throws Exception if the directory cannot be listed or an index
   *                   operation fails
   */
  public static void main(String[] args) throws Exception {
    String usage = "CCDeleteUnlicensedTool <segmentsDir>";

    if (args.length != 1) {
      System.err.println("Usage: " + usage);
      return;
    }

    String segmentsDir = args[0];

    // listFiles() returns null (rather than throwing) when the path is not a
    // directory or an I/O error occurs; fail with a clear message instead of
    // a NullPointerException further down.
    File[] directories = new File(segmentsDir).listFiles();
    if (directories == null) {
      throw new IOException("Cannot list segments directory: " + segmentsDir);
    }

    IndexReader[] readers = openIndexedSegments(directories);
    CCDeleteUnlicensedTool dd = new CCDeleteUnlicensedTool(readers);
    try {
      int maxDoc = 0;
      for (int i = 0; i < readers.length; i++) {
        maxDoc += readers[i].maxDoc();
      }

      int count = dd.deleteUnlicensed();
      LOG.info("CC: deleted " + count + " out of " + maxDoc);
    } finally {
      // Always release the readers, even if the deletion pass failed.
      dd.close();
    }
  }
}