summaryrefslogtreecommitdiff
path: root/src/fulltext
diff options
context:
space:
mode:
authorChristopher Lenz <cmlenz@apache.org>2008-03-28 23:32:19 +0000
committerChristopher Lenz <cmlenz@apache.org>2008-03-28 23:32:19 +0000
commit544a38dd45f6a58d34296c6c768afd086eb2ac70 (patch)
treec84cc02340b06aae189cff0dbfaee698f273f1f5 /src/fulltext
parent804cbbe033b8e7a3e8d7058aaf31bdf69ef18ac5 (diff)
Imported trunk.
git-svn-id: https://svn.apache.org/repos/asf/incubator/couchdb/trunk@642432 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'src/fulltext')
-rw-r--r--src/fulltext/lucene/CouchConfig.java62
-rw-r--r--src/fulltext/lucene/CouchDbDirFilter.java30
-rw-r--r--src/fulltext/lucene/LuceneIndexer.java355
-rw-r--r--src/fulltext/lucene/LuceneSearcher.java90
-rw-r--r--src/fulltext/lucene/readme.txt41
5 files changed, 578 insertions, 0 deletions
diff --git a/src/fulltext/lucene/CouchConfig.java b/src/fulltext/lucene/CouchConfig.java
new file mode 100644
index 00000000..5f4d84ce
--- /dev/null
+++ b/src/fulltext/lucene/CouchConfig.java
@@ -0,0 +1,62 @@
+/*
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use
+this file except in compliance with the License. You may obtain a copy of the
+License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed
+under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+*/
+
+import java.util.*;
+
+
+class CouchConfig
+{
+/* private CouchDocument[] documents;
+*/
+ private Hashtable documents;
+ private long updateSequence;
+
+ public CouchConfig()
+ {
+ documents = new Hashtable();
+ updateSequence = 0;
+ }
+
+ public void setUpdateSequence(long newUpdateSequence)
+ {
+ updateSequence = newUpdateSequence;
+ }
+
+ public long getUpdateSequence()
+ {
+ return updateSequence;
+ }
+
+ public void addDocument(com.fourspaces.couchdb.Document document)
+ {
+ String field;
+// System.out.println(document);
+ field = document.getString("__couchdb_database");
+// System.out.println(field);
+ if(field != null) {
+ documents.put(field, document);
+ }
+ }
+
+ public Hashtable getDocuments()
+ {
+ return documents;
+ }
+
+ public boolean hasDb(String db)
+ {
+ return documents.containsKey(db);
+ }
+}
diff --git a/src/fulltext/lucene/CouchDbDirFilter.java b/src/fulltext/lucene/CouchDbDirFilter.java
new file mode 100644
index 00000000..6b002ce5
--- /dev/null
+++ b/src/fulltext/lucene/CouchDbDirFilter.java
@@ -0,0 +1,30 @@
+/*
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use
+this file except in compliance with the License. You may obtain a copy of the
+License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed
+under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+*/
+
+/*
+
+LuceneIndexer creates a lucene index by intrementally fetching changes from a a
+Apache CouchDB server. It is managed by the Apache CouchDB daemon.
+
+*/
+import java.io.*;
+
+class CouchDbDirFilter implements FilenameFilter
+{
+ public boolean accept(File dir, String name)
+ {
+ return new File(dir, name).isFile();
+ }
+}
diff --git a/src/fulltext/lucene/LuceneIndexer.java b/src/fulltext/lucene/LuceneIndexer.java
new file mode 100644
index 00000000..07040610
--- /dev/null
+++ b/src/fulltext/lucene/LuceneIndexer.java
@@ -0,0 +1,355 @@
+/*
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use
+this file except in compliance with the License. You may obtain a copy of the
+License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed
+under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+*/
+
+/*
+
+LuceneIndexer creates a lucene index by incrementally fetching changes from a a
+Apache CouchDB server. It is managed by the Apache CouchDB daemon.
+
+I know this is Java and there should be a lot of OO going on, but it
+isn't. Sorry about that.
+
+*/
+
+//basics
+import java.io.*;
+import java.net.*;
+import java.util.*;
+import java.nio.channels.FileChannel;
+import java.nio.ByteBuffer;
+import java.lang.reflect.*;
+
+
+//couchdb4j
+//import com.fourspaces.couchdb.*;
+
+//xml
+import org.xml.sax.*;
+import org.xml.sax.helpers.*;
+import javax.xml.parsers.*;
+
+//lucene
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexReader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.SimpleAnalyzer;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.TermQuery;
+
+public class LuceneIndexer
+{
+ private static CouchConfig configuration;
+ private static com.fourspaces.couchdb.Session s;
+
+ public static void main(String[] args) throws Exception
+ {
+/* BufferedWriter out = new BufferedWriter(new FileWriter("LuceneIndexer.log"));
+ out.write("indexer started");out.flush();
+*/
+ String db;
+/* out.write("indexer about to read config");out.flush();*/
+ connect();
+ readConfig();
+
+/* out.write("indexer read config: " + configuration.getDocuments());out.flush();*/
+
+ BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+ try {
+ while((db = in.readLine()) != null) {
+/* out.write("indexer got a poke");out.flush();*/
+
+ if(db.equals("couchdbfulltext")) {
+/* System.out.println("refresh config");
+
+*/ readConfig();
+/* out.write("indexer refreshed config");out.flush();*/
+
+ }
+
+/* out.write("indexer has table: " + db + "?");*/
+
+ if(!configuration.hasDb(db)) {
+/* out.write("... no wait for input");out.flush();*/
+
+ continue;
+ }
+
+/* out.write("yeppa");out.flush();*/
+
+
+ createIndexDir(db);
+ indexChanges(db);
+/* System.out.println(db + " to revision: " + revision);*/
+ }
+ } catch (IOException e) {
+/* out.write("indexer caught IO exception: " + e.getMessage());out.flush();*/
+
+ }
+/* System.out.println("Lucene Indexer stopped");*/
+/* out.write("indexer stopped");out.flush();*/
+
+/* out.close();*/
+
+ }
+
+ public static void connect() throws Exception
+ {
+ s = null;
+ com.fourspaces.couchdb.Session s = new com.fourspaces.couchdb.Session("locahost", 5984);
+ }
+
+ public static void readConfig() throws Exception
+ {
+ //get all docs in /$ftconfig
+ //return array of config docs
+ configuration = null;
+ configuration = new CouchConfig();
+ com.fourspaces.couchdb.Database db = s.getDatabase("couchdbfulltext");
+ com.fourspaces.couchdb.ViewResults changedDocuments = db.getAllDocuments(0);
+
+ for (com.fourspaces.couchdb.Document d: changedDocuments.getResults()) {
+ configuration.addDocument(d);
+ }
+
+/* for(int i = 0; i < changedDocuments.length; i++) {
+ CouchDocument document = changedDocuments[i];
+ document = loadDocumentData(document, "couchdbfulltext");
+ configuration.addDocument(document);
+ }
+*/ }
+
+ public static void indexChanges(String db) throws Exception
+ {
+// System.out.println("Updating index for '" + db + "' from revision: " + revision);
+ int sequence = -1;
+ try {
+ com.fourspaces.couchdb.Database _db = s.getDatabase(db);
+ sequence = _db.getUpdateSeq();
+ com.fourspaces.couchdb.ViewResults changedDocuments = _db.getAllDocuments(sequence);
+
+ if(changedDocuments.size() == 0) {
+// System.out.println("Index is up-to date at sequence_id: " + revision);
+ return;
+ }
+
+ boolean delete = false;
+
+ for (com.fourspaces.couchdb.Document d: changedDocuments.getResults()) {
+ delete = d.getBoolean("delete");
+ documentAddToIndex(db, d, delete);
+ }
+/* for(int idx = 0; idx < changedDocuments.length; idx++) {
+ com.fourspaces.couchdb.Document document = changedDocuments[idx];
+ sequence = document.getUpdateSequence();
+ delete = document.getDelete();
+// System.out.println("Doing: " + document + " with squence: " + sequence + " delete: "+document.getDelete() + " hash code:" + document.hashCode());
+
+ document = loadDocumentData(document, db);
+ // System.out.println(changedDocuments[idx]);
+ // remove from lucene if exists, add to lucene.
+
+ documentAddToIndex(db, document, delete);
+ }
+*/ // CouchDocument document = getDocumentByRevision(db, revision);
+ setRevisionForDb(db, sequence);
+ } catch(Exception e) {
+// System.out.println("Warning: " + db + " says: " + e.getMessage());
+ }
+ }
+
+ public static void documentAddToIndex(String db, com.fourspaces.couchdb.Document document, boolean delete) throws IOException
+ {
+ String index = "Lucene/Index/" + db;
+ boolean create = true;
+
+/* System.out.println("DEBUG: delete: " + delete);*/
+/* System.out.println("DEBUG: create index? " + create);*/
+
+ if(IndexReader.indexExists(index)) {
+ create = false;
+ Term term = new Term("__couchdb_document_id", document.getId());
+/* System.out.println("DEBUG: Deleting: " + document + " with term:" + term);*/
+ IndexReader reader = IndexReader.open(index);
+ reader.deleteDocuments(term);
+/* System.out.println("DEBUG: reader has deletions: " + reader.hasDeletions());*/
+
+ reader.close();
+ }
+
+ if(!delete) {
+ Analyzer analyzer = new SimpleAnalyzer();
+
+ IndexWriter writer = new IndexWriter(index, analyzer, create);
+ writer.setUseCompoundFile(true);
+
+/* Collection fields = document.keys();*/
+ Document luceneDocument = new Document();
+
+/* Set tmpKeys = fields.keySet();
+ Object keys[] = tmpKeys.toArray();
+*/ String keywords = "";
+
+ for (Iterator it = document.keys(); it.hasNext(); ) {
+ Object key = it.next();
+ String value = document.getString((String)key);
+
+ if(key.equals("__couchdb_document_id") || key.equals("__couchdb_document_revision")) {
+ luceneDocument.add(new Field((String)key, value, Field.Store.YES, Field.Index.UN_TOKENIZED));
+ } else {
+ luceneDocument.add(new Field((String)key, value, Field.Store.YES, Field.Index.TOKENIZED));
+ keywords = keywords + " " + value;
+ }
+ }
+ if(keywords.length() > 0) {
+ luceneDocument.add(new Field("__couchdb_keywords", keywords, Field.Store.YES, Field.Index.TOKENIZED));
+ }
+
+
+/* for(int idx = 0; idx < keys.length; idx++) {
+ // System.out.println("DEBUG: Add Field: "+ keys[idx] + " with value: " + fields.get(keys[idx]));
+ Hashtable field = (Hashtable)fields.get(keys[idx]);
+ if(field == null) {return;}
+ for(int fieldIdx = 0; fieldIdx < field.size(); fieldIdx++) {
+ String value = (String)field.get(fieldIdx);
+ if(value == null) {
+ value = "";
+ }
+ // System.out.println("DEBUG: fieldIdx:" + fieldIdx + " and value: "+ value);
+ String key = (String)keys[idx];
+ if(key.equals("__couchdb_document_id") || key.equals("__couchdb_document_revision")) {
+ luceneDocument.add(new Field(key, value, Field.Store.YES, Field.Index.UN_TOKENIZED));
+ } else {
+ luceneDocument.add(new Field(key, value, Field.Store.YES, Field.Index.TOKENIZED));
+ keywords = keywords + " " + value;
+ }
+ }
+*/// }
+ writer.addDocument(luceneDocument);
+ writer.optimize();
+ writer.close();
+ }
+ }
+
+
+ private static void setRevisionForDb(String db, long revision) throws Exception
+ {
+ File dbFile = new File("Lucene/State/" + db);
+
+ RandomAccessFile stateFile = new RandomAccessFile("Lucene/State/" + db, "rwd");
+ stateFile.writeBytes(String.valueOf(revision));
+ return;
+ }
+
+ private static String[] getDBs()
+ {
+ File dbRoot = new File("db_root");
+ if(!dbRoot.isDirectory()) {
+ return new String[0];
+ }
+
+ String[] dbs = dbRoot.list(new CouchDbDirFilter());
+
+ return dbs;
+ }
+
+ private static long getRevisionForDb(String db) throws Exception
+ {
+
+ File dbFile = new File("Lucene/State/" + db);
+ if(!dbFile.exists()) {
+ return 0;
+ }
+
+
+ RandomAccessFile stateFile = new RandomAccessFile("Lucene/State/" + db, "r");
+ String revision = stateFile.readLine();
+// System.out.println("rev: " + revision);
+ return (long)Integer.parseInt(revision);
+ }
+
+ private static void createIndexDir(String db)
+ {
+ File indexDir = new File("Lucene/Index/" + db);
+ if(!indexDir.exists()) {
+ indexDir.mkdirs();
+ System.out.println("Created Index Directory");
+ }
+
+ File stateDir = new File("Lucene/State");
+ if(!stateDir.exists()) {
+ stateDir.mkdirs();
+ System.out.println("Created State Directory");
+ }
+ }
+
+ private static XMLReader getParser(SAXCouchDocumentBuilder documentBuilder) throws Exception
+ {
+ SAXParserFactory factory = SAXParserFactory.newInstance();
+ SAXParser saxParser = factory.newSAXParser();
+ XMLReader parser = saxParser.getXMLReader();
+ parser.setContentHandler(documentBuilder);
+ return parser;
+ }
+
+ private static BufferedInputStream getUrlStream(String address) throws Exception
+ {
+ URL url = new URL(address);
+ InputStream inStream = url.openStream();
+ return new BufferedInputStream(inStream);
+ }
+
+ public static com.fourspaces.couchdb.ViewResults getChangedDocumentsSinceRevision(String db, int revision) throws Exception
+ {
+ //BufferedInputStream inBuffer = getUrlStream("http://localhost:5984/" + db + "/_all_docs_by_update_seq?startkey=" + revision);
+
+ com.fourspaces.couchdb.ViewResults newDocs = s.getDatabase(db).getAllDocuments(revision);
+
+ return newDocs;
+ //return CouchDocument[]
+
+/* CouchDocument[] returnValue = {};
+*/ //setup xml parser
+/* SAXCouchDocumentBuilder documentBuilder = new SAXCouchDocumentBuilder();
+ XMLReader parser = getParser(documentBuilder);
+ // Repeat until end of file
+ parser.parse(new InputSource(inBuffer));
+
+
+ return documentBuilder.getDocuments();
+*/ }
+
+
+ public static CouchDocument loadDocumentData(CouchDocument document, String db) throws Exception
+ {
+ BufferedInputStream inBuffer = getUrlStream("http://localhost:5984/" + db + "/" + document.getDocId() + "?rev=" + document.getRevision());
+
+ //setup xml parser
+ SAXCouchDocumentBuilder documentBuilder = new SAXCouchDocumentBuilder();
+ XMLReader parser = getParser(documentBuilder);
+
+ // Repeat until end of file
+ parser.parse(new InputSource(inBuffer));
+
+ return documentBuilder.getDocument();
+ }
+}
diff --git a/src/fulltext/lucene/LuceneSearcher.java b/src/fulltext/lucene/LuceneSearcher.java
new file mode 100644
index 00000000..a5ccbe89
--- /dev/null
+++ b/src/fulltext/lucene/LuceneSearcher.java
@@ -0,0 +1,90 @@
+/*
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use
+this file except in compliance with the License. You may obtain a copy of the
+License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed
+under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+*/
+
+/*
+
+LuceneSearcher searches a lucene index.
+
+It is managed by the Apache CouchDB daemon.
+
+*/
+
+//basics
+import java.io.*;
+
+//lucene
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.IndexReader;
+
+import org.apache.lucene.document.Document;
+
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.Query;
+
+/*
+protocol:
+Queries will look like this:
+
+databasename\n
+the full text query\n
+
+Then the java reader will read the lines and respond
+by outputing each document result:
+ok\n
+docid1\n
+score1\n
+docid2\n
+score2\n
+docid3\n
+score3\n
+\n
+
+or:
+
+error\n
+error_id\n
+error message\n
+
+*/
+public class LuceneSearcher
+{
+ public static void main(String[] args) throws Exception
+ {
+
+ BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+
+ String db = "";
+ String queryString = "";
+
+ while(((db = in.readLine()) != null) && ((queryString = in.readLine()) != null)) {
+
+ IndexSearcher searcher = new IndexSearcher("Lucene/Index/" + db);
+
+ Query query = new TermQuery(new Term("__couchdb_keywords", queryString));
+
+ Hits hits = searcher.search(query);
+
+ System.out.println("ok");
+ for(int i = 0; i < hits.length(); i++) {
+ Document d = hits.doc(i);
+ System.out.println(d.get("__couchdb_document_id"));
+ System.out.println(hits.score(i));
+ }
+ System.out.println();
+ }
+ }
+}
diff --git a/src/fulltext/lucene/readme.txt b/src/fulltext/lucene/readme.txt
new file mode 100644
index 00000000..c115534c
--- /dev/null
+++ b/src/fulltext/lucene/readme.txt
@@ -0,0 +1,41 @@
+This is still work in progress and has not been integrated into the build
+process. Good luck though :)
+
+This document describes how to use the LuceneIndexer with Apache CouchDB.
+
+Requirements:
+Apache CouchDB 0.6.4 or newer.
+Java Development Kit (JDK) 1.5
+Lucene 2.0.0 or newer
+couchdb4j (http://code.google.com/p/couchdb4j/)
+
+
+If you don't already have it,
+download lucene-core-2.0.0.jar from a mirror
+A list of mirrors can be found at
+http://www.apache.org/dyn/closer.cgi/lucene/java/
+
+Add the following line to your couch.ini:
+LuceneServer=/usr/bin/java -cp "./bin/:./lib/lucene-core.jar" LuceneIndexer=...
+
+Adjust the version number and the path to java, if needed.
+If you have lucene installed already, remove the
+'-cp "./bin/:./Lucene/lucene-core-2.0.0.jar"' part.
+
+Put lucene-core.jar and cocuhdb4j.jar into $CouchDbDir/lib
+
+Launch Apache CouchDB.
+
+The indexer will populate $CouchDbDir/Lucene/Index with an index for
+all documents in all databases.
+(indexes per database will be added soon).
+
+To see that the data is actually stored in there,
+use luke from http://www.getopt.org/luke/
+
+To use the actual index, you could use the PHP 5 Lucene Demo in the Zend Framework
+(http://framework.zend.com) or any other Lucene implementation in your favourite
+language.
+
+If you have any questions, please visit:
+http://couchdb.com/CouchDB/CouchDBWeb.nsf/vDissByDate