From 544a38dd45f6a58d34296c6c768afd086eb2ac70 Mon Sep 17 00:00:00 2001 From: Christopher Lenz Date: Fri, 28 Mar 2008 23:32:19 +0000 Subject: Imported trunk. git-svn-id: https://svn.apache.org/repos/asf/incubator/couchdb/trunk@642432 13f79535-47bb-0310-9956-ffa450edef68 --- src/fulltext/lucene/CouchConfig.java | 62 ++++++ src/fulltext/lucene/CouchDbDirFilter.java | 30 +++ src/fulltext/lucene/LuceneIndexer.java | 355 ++++++++++++++++++++++++++++++ src/fulltext/lucene/LuceneSearcher.java | 90 ++++++++ src/fulltext/lucene/readme.txt | 41 ++++ 5 files changed, 578 insertions(+) create mode 100644 src/fulltext/lucene/CouchConfig.java create mode 100644 src/fulltext/lucene/CouchDbDirFilter.java create mode 100644 src/fulltext/lucene/LuceneIndexer.java create mode 100644 src/fulltext/lucene/LuceneSearcher.java create mode 100644 src/fulltext/lucene/readme.txt (limited to 'src/fulltext/lucene') diff --git a/src/fulltext/lucene/CouchConfig.java b/src/fulltext/lucene/CouchConfig.java new file mode 100644 index 00000000..5f4d84ce --- /dev/null +++ b/src/fulltext/lucene/CouchConfig.java @@ -0,0 +1,62 @@ +/* + +Licensed under the Apache License, Version 2.0 (the "License"); you may not use +this file except in compliance with the License. You may obtain a copy of the +License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed +under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations under the License. + +*/ + +import java.util.*; + + +class CouchConfig +{ +/* private CouchDocument[] documents; +*/ + private Hashtable documents; + private long updateSequence; + + public CouchConfig() + { + documents = new Hashtable(); + updateSequence = 0; + } + + public void setUpdateSequence(long newUpdateSequence) + { + updateSequence = newUpdateSequence; + } + + public long getUpdateSequence() + { + return updateSequence; + } + + public void addDocument(com.fourspaces.couchdb.Document document) + { + String field; +// System.out.println(document); + field = document.getString("__couchdb_database"); +// System.out.println(field); + if(field != null) { + documents.put(field, document); + } + } + + public Hashtable getDocuments() + { + return documents; + } + + public boolean hasDb(String db) + { + return documents.containsKey(db); + } +} diff --git a/src/fulltext/lucene/CouchDbDirFilter.java b/src/fulltext/lucene/CouchDbDirFilter.java new file mode 100644 index 00000000..6b002ce5 --- /dev/null +++ b/src/fulltext/lucene/CouchDbDirFilter.java @@ -0,0 +1,30 @@ +/* + +Licensed under the Apache License, Version 2.0 (the "License"); you may not use +this file except in compliance with the License. You may obtain a copy of the +License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed +under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations under the License. + +*/ + +/* + +LuceneIndexer creates a lucene index by intrementally fetching changes from a a +Apache CouchDB server. It is managed by the Apache CouchDB daemon. + +*/ +import java.io.*; + +class CouchDbDirFilter implements FilenameFilter +{ + public boolean accept(File dir, String name) + { + return new File(dir, name).isFile(); + } +} diff --git a/src/fulltext/lucene/LuceneIndexer.java b/src/fulltext/lucene/LuceneIndexer.java new file mode 100644 index 00000000..07040610 --- /dev/null +++ b/src/fulltext/lucene/LuceneIndexer.java @@ -0,0 +1,355 @@ +/* + +Licensed under the Apache License, Version 2.0 (the "License"); you may not use +this file except in compliance with the License. You may obtain a copy of the +License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed +under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations under the License. + +*/ + +/* + +LuceneIndexer creates a lucene index by incrementally fetching changes from a a +Apache CouchDB server. It is managed by the Apache CouchDB daemon. + +I know this is Java and there should be a lot of OO going on, but it +isn't. Sorry about that. + +*/ + +//basics +import java.io.*; +import java.net.*; +import java.util.*; +import java.nio.channels.FileChannel; +import java.nio.ByteBuffer; +import java.lang.reflect.*; + + +//couchdb4j +//import com.fourspaces.couchdb.*; + +//xml +import org.xml.sax.*; +import org.xml.sax.helpers.*; +import javax.xml.parsers.*; + +//lucene +import org.apache.lucene.index.Term; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexReader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.SimpleAnalyzer; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; + +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.Hits; +import org.apache.lucene.search.TermQuery; + +public class LuceneIndexer +{ + private static CouchConfig configuration; + private static com.fourspaces.couchdb.Session s; + + public static void main(String[] args) throws Exception + { +/* BufferedWriter out = new BufferedWriter(new FileWriter("LuceneIndexer.log")); + out.write("indexer started");out.flush(); +*/ + String db; +/* out.write("indexer about to read config");out.flush();*/ + connect(); + readConfig(); + +/* out.write("indexer read config: " + configuration.getDocuments());out.flush();*/ + + BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); + try { + while((db = in.readLine()) != null) { +/* out.write("indexer got a poke");out.flush();*/ + + if(db.equals("couchdbfulltext")) { +/* System.out.println("refresh config"); + +*/ readConfig(); +/* out.write("indexer refreshed config");out.flush();*/ + + } + +/* out.write("indexer has table: " + db + "?");*/ + + if(!configuration.hasDb(db)) { +/* out.write("... no wait for input");out.flush();*/ + + continue; + } + +/* out.write("yeppa");out.flush();*/ + + + createIndexDir(db); + indexChanges(db); +/* System.out.println(db + " to revision: " + revision);*/ + } + } catch (IOException e) { +/* out.write("indexer caught IO exception: " + e.getMessage());out.flush();*/ + + } +/* System.out.println("Lucene Indexer stopped");*/ +/* out.write("indexer stopped");out.flush();*/ + +/* out.close();*/ + + } + + public static void connect() throws Exception + { + s = null; + com.fourspaces.couchdb.Session s = new com.fourspaces.couchdb.Session("locahost", 5984); + } + + public static void readConfig() throws Exception + { + //get all docs in /$ftconfig + //return array of config docs + configuration = null; + configuration = new CouchConfig(); + com.fourspaces.couchdb.Database db = s.getDatabase("couchdbfulltext"); + com.fourspaces.couchdb.ViewResults changedDocuments = db.getAllDocuments(0); + + for (com.fourspaces.couchdb.Document d: changedDocuments.getResults()) { + configuration.addDocument(d); + } + +/* for(int i = 0; i < changedDocuments.length; i++) { + CouchDocument document = changedDocuments[i]; + document = loadDocumentData(document, "couchdbfulltext"); + configuration.addDocument(document); + } +*/ } + + public static void indexChanges(String db) throws Exception + { +// System.out.println("Updating index for '" + db + "' from revision: " + revision); + int sequence = -1; + try { + com.fourspaces.couchdb.Database _db = s.getDatabase(db); + sequence = _db.getUpdateSeq(); + com.fourspaces.couchdb.ViewResults changedDocuments = _db.getAllDocuments(sequence); + + if(changedDocuments.size() == 0) { +// System.out.println("Index is up-to date at sequence_id: " + revision); + return; + } + + boolean delete = false; + + for (com.fourspaces.couchdb.Document d: changedDocuments.getResults()) { + delete = d.getBoolean("delete"); + documentAddToIndex(db, d, delete); + } +/* for(int idx = 0; idx < changedDocuments.length; idx++) { + com.fourspaces.couchdb.Document document = changedDocuments[idx]; + sequence = document.getUpdateSequence(); + delete = document.getDelete(); +// System.out.println("Doing: " + document + " with squence: " + sequence + " delete: "+document.getDelete() + " hash code:" + document.hashCode()); + + document = loadDocumentData(document, db); + // System.out.println(changedDocuments[idx]); + // remove from lucene if exists, add to lucene. + + documentAddToIndex(db, document, delete); + } +*/ // CouchDocument document = getDocumentByRevision(db, revision); + setRevisionForDb(db, sequence); + } catch(Exception e) { +// System.out.println("Warning: " + db + " says: " + e.getMessage()); + } + } + + public static void documentAddToIndex(String db, com.fourspaces.couchdb.Document document, boolean delete) throws IOException + { + String index = "Lucene/Index/" + db; + boolean create = true; + +/* System.out.println("DEBUG: delete: " + delete);*/ +/* System.out.println("DEBUG: create index? " + create);*/ + + if(IndexReader.indexExists(index)) { + create = false; + Term term = new Term("__couchdb_document_id", document.getId()); +/* System.out.println("DEBUG: Deleting: " + document + " with term:" + term);*/ + IndexReader reader = IndexReader.open(index); + reader.deleteDocuments(term); +/* System.out.println("DEBUG: reader has deletions: " + reader.hasDeletions());*/ + + reader.close(); + } + + if(!delete) { + Analyzer analyzer = new SimpleAnalyzer(); + + IndexWriter writer = new IndexWriter(index, analyzer, create); + writer.setUseCompoundFile(true); + +/* Collection fields = document.keys();*/ + Document luceneDocument = new Document(); + +/* Set tmpKeys = fields.keySet(); + Object keys[] = tmpKeys.toArray(); +*/ String keywords = ""; + + for (Iterator it = document.keys(); it.hasNext(); ) { + Object key = it.next(); + String value = document.getString((String)key); + + if(key.equals("__couchdb_document_id") || key.equals("__couchdb_document_revision")) { + luceneDocument.add(new Field((String)key, value, Field.Store.YES, Field.Index.UN_TOKENIZED)); + } else { + luceneDocument.add(new Field((String)key, value, Field.Store.YES, Field.Index.TOKENIZED)); + keywords = keywords + " " + value; + } + } + if(keywords.length() > 0) { + luceneDocument.add(new Field("__couchdb_keywords", keywords, Field.Store.YES, Field.Index.TOKENIZED)); + } + + +/* for(int idx = 0; idx < keys.length; idx++) { + // System.out.println("DEBUG: Add Field: "+ keys[idx] + " with value: " + fields.get(keys[idx])); + Hashtable field = (Hashtable)fields.get(keys[idx]); + if(field == null) {return;} + for(int fieldIdx = 0; fieldIdx < field.size(); fieldIdx++) { + String value = (String)field.get(fieldIdx); + if(value == null) { + value = ""; + } + // System.out.println("DEBUG: fieldIdx:" + fieldIdx + " and value: "+ value); + String key = (String)keys[idx]; + if(key.equals("__couchdb_document_id") || key.equals("__couchdb_document_revision")) { + luceneDocument.add(new Field(key, value, Field.Store.YES, Field.Index.UN_TOKENIZED)); + } else { + luceneDocument.add(new Field(key, value, Field.Store.YES, Field.Index.TOKENIZED)); + keywords = keywords + " " + value; + } + } +*/// } + writer.addDocument(luceneDocument); + writer.optimize(); + writer.close(); + } + } + + + private static void setRevisionForDb(String db, long revision) throws Exception + { + File dbFile = new File("Lucene/State/" + db); + + RandomAccessFile stateFile = new RandomAccessFile("Lucene/State/" + db, "rwd"); + stateFile.writeBytes(String.valueOf(revision)); + return; + } + + private static String[] getDBs() + { + File dbRoot = new File("db_root"); + if(!dbRoot.isDirectory()) { + return new String[0]; + } + + String[] dbs = dbRoot.list(new CouchDbDirFilter()); + + return dbs; + } + + private static long getRevisionForDb(String db) throws Exception + { + + File dbFile = new File("Lucene/State/" + db); + if(!dbFile.exists()) { + return 0; + } + + + RandomAccessFile stateFile = new RandomAccessFile("Lucene/State/" + db, "r"); + String revision = stateFile.readLine(); +// System.out.println("rev: " + revision); + return (long)Integer.parseInt(revision); + } + + private static void createIndexDir(String db) + { + File indexDir = new File("Lucene/Index/" + db); + if(!indexDir.exists()) { + indexDir.mkdirs(); + System.out.println("Created Index Directory"); + } + + File stateDir = new File("Lucene/State"); + if(!stateDir.exists()) { + stateDir.mkdirs(); + System.out.println("Created State Directory"); + } + } + + private static XMLReader getParser(SAXCouchDocumentBuilder documentBuilder) throws Exception + { + SAXParserFactory factory = SAXParserFactory.newInstance(); + SAXParser saxParser = factory.newSAXParser(); + XMLReader parser = saxParser.getXMLReader(); + parser.setContentHandler(documentBuilder); + return parser; + } + + private static BufferedInputStream getUrlStream(String address) throws Exception + { + URL url = new URL(address); + InputStream inStream = url.openStream(); + return new BufferedInputStream(inStream); + } + + public static com.fourspaces.couchdb.ViewResults getChangedDocumentsSinceRevision(String db, int revision) throws Exception + { + //BufferedInputStream inBuffer = getUrlStream("http://localhost:5984/" + db + "/_all_docs_by_update_seq?startkey=" + revision); + + com.fourspaces.couchdb.ViewResults newDocs = s.getDatabase(db).getAllDocuments(revision); + + return newDocs; + //return CouchDocument[] + +/* CouchDocument[] returnValue = {}; +*/ //setup xml parser +/* SAXCouchDocumentBuilder documentBuilder = new SAXCouchDocumentBuilder(); + XMLReader parser = getParser(documentBuilder); + // Repeat until end of file + parser.parse(new InputSource(inBuffer)); + + + return documentBuilder.getDocuments(); +*/ } + + + public static CouchDocument loadDocumentData(CouchDocument document, String db) throws Exception + { + BufferedInputStream inBuffer = getUrlStream("http://localhost:5984/" + db + "/" + document.getDocId() + "?rev=" + document.getRevision()); + + //setup xml parser + SAXCouchDocumentBuilder documentBuilder = new SAXCouchDocumentBuilder(); + XMLReader parser = getParser(documentBuilder); + + // Repeat until end of file + parser.parse(new InputSource(inBuffer)); + + return documentBuilder.getDocument(); + } +} diff --git a/src/fulltext/lucene/LuceneSearcher.java b/src/fulltext/lucene/LuceneSearcher.java new file mode 100644 index 00000000..a5ccbe89 --- /dev/null +++ b/src/fulltext/lucene/LuceneSearcher.java @@ -0,0 +1,90 @@ +/* + +Licensed under the Apache License, Version 2.0 (the "License"); you may not use +this file except in compliance with the License. You may obtain a copy of the +License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed +under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +CONDITIONS OF ANY KIND, either express or implied. See the License for the +specific language governing permissions and limitations under the License. + +*/ + +/* + +LuceneSearcher searches a lucene index. + +It is managed by the Apache CouchDB daemon. + +*/ + +//basics +import java.io.*; + +//lucene +import org.apache.lucene.index.Term; +import org.apache.lucene.index.IndexReader; + +import org.apache.lucene.document.Document; + +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Hits; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.Query; + +/* +protocol: +Queries will look like this: + +databasename\n +the full text query\n + +Then the java reader will read the lines and respond +by outputing each document result: +ok\n +docid1\n +score1\n +docid2\n +score2\n +docid3\n +score3\n +\n + +or: + +error\n +error_id\n +error message\n + +*/ +public class LuceneSearcher +{ + public static void main(String[] args) throws Exception + { + + BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); + + String db = ""; + String queryString = ""; + + while(((db = in.readLine()) != null) && ((queryString = in.readLine()) != null)) { + + IndexSearcher searcher = new IndexSearcher("Lucene/Index/" + db); + + Query query = new TermQuery(new Term("__couchdb_keywords", queryString)); + + Hits hits = searcher.search(query); + + System.out.println("ok"); + for(int i = 0; i < hits.length(); i++) { + Document d = hits.doc(i); + System.out.println(d.get("__couchdb_document_id")); + System.out.println(hits.score(i)); + } + System.out.println(); + } + } +} diff --git a/src/fulltext/lucene/readme.txt b/src/fulltext/lucene/readme.txt new file mode 100644 index 00000000..c115534c --- /dev/null +++ b/src/fulltext/lucene/readme.txt @@ -0,0 +1,41 @@ +This is still work in progress and has not been integrated into the build +process. Good luck though :) + +This document describes how to use the LuceneIndexer with Apache CouchDB. + +Requirements: +Apache CouchDB 0.6.4 or newer. +Java Development Kit (JDK) 1.5 +Lucene 2.0.0 or newer +couchdb4j (http://code.google.com/p/couchdb4j/) + + +If you don't already have it, +download lucene-core-2.0.0.jar from a mirror +A list of mirrors can be found at +http://www.apache.org/dyn/closer.cgi/lucene/java/ + +Add the following line to your couch.ini: +LuceneServer=/usr/bin/java -cp "./bin/:./lib/lucene-core.jar" LuceneIndexer=... + +Adjust the version number and the path to java, if needed. +If you have lucene installed already, remove the +'-cp "./bin/:./Lucene/lucene-core-2.0.0.jar"' part. + +Put lucene-core.jar and cocuhdb4j.jar into $CouchDbDir/lib + +Launch Apache CouchDB. + +The indexer will populate $CouchDbDir/Lucene/Index with an index for +all documents in all databases. +(indexes per database will be added soon). + +To see that the data is actually stored in there, +use luke from http://www.getopt.org/luke/ + +To use the actual index, you could use the PHP 5 Lucene Demo in the Zend Framework +(http://framework.zend.com) or any other Lucene implementation in your favourite +language. + +If you have any questions, please visit: +http://couchdb.com/CouchDB/CouchDBWeb.nsf/vDissByDate -- cgit v1.2.3