diff options
Diffstat (limited to 'src/fulltext')
-rw-r--r-- | src/fulltext/lucene/CouchConfig.java | 62 | ||||
-rw-r--r-- | src/fulltext/lucene/CouchDbDirFilter.java | 30 | ||||
-rw-r--r-- | src/fulltext/lucene/LuceneIndexer.java | 355 | ||||
-rw-r--r-- | src/fulltext/lucene/LuceneSearcher.java | 90 | ||||
-rw-r--r-- | src/fulltext/lucene/readme.txt | 41 |
5 files changed, 0 insertions, 578 deletions
diff --git a/src/fulltext/lucene/CouchConfig.java b/src/fulltext/lucene/CouchConfig.java deleted file mode 100644 index 5f4d84ce..00000000 --- a/src/fulltext/lucene/CouchConfig.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - -Licensed under the Apache License, Version 2.0 (the "License"); you may not use -this file except in compliance with the License. You may obtain a copy of the -License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software distributed -under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -CONDITIONS OF ANY KIND, either express or implied. See the License for the -specific language governing permissions and limitations under the License. - -*/ - -import java.util.*; - - -class CouchConfig -{ -/* private CouchDocument[] documents; -*/ - private Hashtable documents; - private long updateSequence; - - public CouchConfig() - { - documents = new Hashtable(); - updateSequence = 0; - } - - public void setUpdateSequence(long newUpdateSequence) - { - updateSequence = newUpdateSequence; - } - - public long getUpdateSequence() - { - return updateSequence; - } - - public void addDocument(com.fourspaces.couchdb.Document document) - { - String field; -// System.out.println(document); - field = document.getString("__couchdb_database"); -// System.out.println(field); - if(field != null) { - documents.put(field, document); - } - } - - public Hashtable getDocuments() - { - return documents; - } - - public boolean hasDb(String db) - { - return documents.containsKey(db); - } -} diff --git a/src/fulltext/lucene/CouchDbDirFilter.java b/src/fulltext/lucene/CouchDbDirFilter.java deleted file mode 100644 index 6b002ce5..00000000 --- a/src/fulltext/lucene/CouchDbDirFilter.java +++ /dev/null @@ -1,30 +0,0 @@ -/* - -Licensed under the Apache License, Version 2.0 (the "License"); you may not use -this file except in compliance with the License. You may obtain a copy of the -License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software distributed -under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -CONDITIONS OF ANY KIND, either express or implied. See the License for the -specific language governing permissions and limitations under the License. - -*/ - -/* - -LuceneIndexer creates a lucene index by intrementally fetching changes from a a -Apache CouchDB server. It is managed by the Apache CouchDB daemon. - -*/ -import java.io.*; - -class CouchDbDirFilter implements FilenameFilter -{ - public boolean accept(File dir, String name) - { - return new File(dir, name).isFile(); - } -} diff --git a/src/fulltext/lucene/LuceneIndexer.java b/src/fulltext/lucene/LuceneIndexer.java deleted file mode 100644 index 07040610..00000000 --- a/src/fulltext/lucene/LuceneIndexer.java +++ /dev/null @@ -1,355 +0,0 @@ -/* - -Licensed under the Apache License, Version 2.0 (the "License"); you may not use -this file except in compliance with the License. You may obtain a copy of the -License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software distributed -under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -CONDITIONS OF ANY KIND, either express or implied. See the License for the -specific language governing permissions and limitations under the License. - -*/ - -/* - -LuceneIndexer creates a lucene index by incrementally fetching changes from a a -Apache CouchDB server. It is managed by the Apache CouchDB daemon. - -I know this is Java and there should be a lot of OO going on, but it -isn't. Sorry about that. - -*/ - -//basics -import java.io.*; -import java.net.*; -import java.util.*; -import java.nio.channels.FileChannel; -import java.nio.ByteBuffer; -import java.lang.reflect.*; - - -//couchdb4j -//import com.fourspaces.couchdb.*; - -//xml -import org.xml.sax.*; -import org.xml.sax.helpers.*; -import javax.xml.parsers.*; - -//lucene -import org.apache.lucene.index.Term; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.IndexReader; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.SimpleAnalyzer; - -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; - -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.Hits; -import org.apache.lucene.search.TermQuery; - -public class LuceneIndexer -{ - private static CouchConfig configuration; - private static com.fourspaces.couchdb.Session s; - - public static void main(String[] args) throws Exception - { -/* BufferedWriter out = new BufferedWriter(new FileWriter("LuceneIndexer.log")); - out.write("indexer started");out.flush(); -*/ - String db; -/* out.write("indexer about to read config");out.flush();*/ - connect(); - readConfig(); - -/* out.write("indexer read config: " + configuration.getDocuments());out.flush();*/ - - BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); - try { - while((db = in.readLine()) != null) { -/* out.write("indexer got a poke");out.flush();*/ - - if(db.equals("couchdbfulltext")) { -/* System.out.println("refresh config"); - -*/ readConfig(); -/* out.write("indexer refreshed config");out.flush();*/ - - } - -/* out.write("indexer has table: " + db + "?");*/ - - if(!configuration.hasDb(db)) { -/* out.write("... no wait for input");out.flush();*/ - - continue; - } - -/* out.write("yeppa");out.flush();*/ - - - createIndexDir(db); - indexChanges(db); -/* System.out.println(db + " to revision: " + revision);*/ - } - } catch (IOException e) { -/* out.write("indexer caught IO exception: " + e.getMessage());out.flush();*/ - - } -/* System.out.println("Lucene Indexer stopped");*/ -/* out.write("indexer stopped");out.flush();*/ - -/* out.close();*/ - - } - - public static void connect() throws Exception - { - s = null; - com.fourspaces.couchdb.Session s = new com.fourspaces.couchdb.Session("locahost", 5984); - } - - public static void readConfig() throws Exception - { - //get all docs in /$ftconfig - //return array of config docs - configuration = null; - configuration = new CouchConfig(); - com.fourspaces.couchdb.Database db = s.getDatabase("couchdbfulltext"); - com.fourspaces.couchdb.ViewResults changedDocuments = db.getAllDocuments(0); - - for (com.fourspaces.couchdb.Document d: changedDocuments.getResults()) { - configuration.addDocument(d); - } - -/* for(int i = 0; i < changedDocuments.length; i++) { - CouchDocument document = changedDocuments[i]; - document = loadDocumentData(document, "couchdbfulltext"); - configuration.addDocument(document); - } -*/ } - - public static void indexChanges(String db) throws Exception - { -// System.out.println("Updating index for '" + db + "' from revision: " + revision); - int sequence = -1; - try { - com.fourspaces.couchdb.Database _db = s.getDatabase(db); - sequence = _db.getUpdateSeq(); - com.fourspaces.couchdb.ViewResults changedDocuments = _db.getAllDocuments(sequence); - - if(changedDocuments.size() == 0) { -// System.out.println("Index is up-to date at sequence_id: " + revision); - return; - } - - boolean delete = false; - - for (com.fourspaces.couchdb.Document d: changedDocuments.getResults()) { - delete = d.getBoolean("delete"); - documentAddToIndex(db, d, delete); - } -/* for(int idx = 0; idx < changedDocuments.length; idx++) { - com.fourspaces.couchdb.Document document = changedDocuments[idx]; - sequence = document.getUpdateSequence(); - delete = document.getDelete(); -// System.out.println("Doing: " + document + " with squence: " + sequence + " delete: "+document.getDelete() + " hash code:" + document.hashCode()); - - document = loadDocumentData(document, db); - // System.out.println(changedDocuments[idx]); - // remove from lucene if exists, add to lucene. - - documentAddToIndex(db, document, delete); - } -*/ // CouchDocument document = getDocumentByRevision(db, revision); - setRevisionForDb(db, sequence); - } catch(Exception e) { -// System.out.println("Warning: " + db + " says: " + e.getMessage()); - } - } - - public static void documentAddToIndex(String db, com.fourspaces.couchdb.Document document, boolean delete) throws IOException - { - String index = "Lucene/Index/" + db; - boolean create = true; - -/* System.out.println("DEBUG: delete: " + delete);*/ -/* System.out.println("DEBUG: create index? " + create);*/ - - if(IndexReader.indexExists(index)) { - create = false; - Term term = new Term("__couchdb_document_id", document.getId()); -/* System.out.println("DEBUG: Deleting: " + document + " with term:" + term);*/ - IndexReader reader = IndexReader.open(index); - reader.deleteDocuments(term); -/* System.out.println("DEBUG: reader has deletions: " + reader.hasDeletions());*/ - - reader.close(); - } - - if(!delete) { - Analyzer analyzer = new SimpleAnalyzer(); - - IndexWriter writer = new IndexWriter(index, analyzer, create); - writer.setUseCompoundFile(true); - -/* Collection fields = document.keys();*/ - Document luceneDocument = new Document(); - -/* Set tmpKeys = fields.keySet(); - Object keys[] = tmpKeys.toArray(); -*/ String keywords = ""; - - for (Iterator it = document.keys(); it.hasNext(); ) { - Object key = it.next(); - String value = document.getString((String)key); - - if(key.equals("__couchdb_document_id") || key.equals("__couchdb_document_revision")) { - luceneDocument.add(new Field((String)key, value, Field.Store.YES, Field.Index.UN_TOKENIZED)); - } else { - luceneDocument.add(new Field((String)key, value, Field.Store.YES, Field.Index.TOKENIZED)); - keywords = keywords + " " + value; - } - } - if(keywords.length() > 0) { - luceneDocument.add(new Field("__couchdb_keywords", keywords, Field.Store.YES, Field.Index.TOKENIZED)); - } - - -/* for(int idx = 0; idx < keys.length; idx++) { - // System.out.println("DEBUG: Add Field: "+ keys[idx] + " with value: " + fields.get(keys[idx])); - Hashtable field = (Hashtable)fields.get(keys[idx]); - if(field == null) {return;} - for(int fieldIdx = 0; fieldIdx < field.size(); fieldIdx++) { - String value = (String)field.get(fieldIdx); - if(value == null) { - value = ""; - } - // System.out.println("DEBUG: fieldIdx:" + fieldIdx + " and value: "+ value); - String key = (String)keys[idx]; - if(key.equals("__couchdb_document_id") || key.equals("__couchdb_document_revision")) { - luceneDocument.add(new Field(key, value, Field.Store.YES, Field.Index.UN_TOKENIZED)); - } else { - luceneDocument.add(new Field(key, value, Field.Store.YES, Field.Index.TOKENIZED)); - keywords = keywords + " " + value; - } - } -*/// } - writer.addDocument(luceneDocument); - writer.optimize(); - writer.close(); - } - } - - - private static void setRevisionForDb(String db, long revision) throws Exception - { - File dbFile = new File("Lucene/State/" + db); - - RandomAccessFile stateFile = new RandomAccessFile("Lucene/State/" + db, "rwd"); - stateFile.writeBytes(String.valueOf(revision)); - return; - } - - private static String[] getDBs() - { - File dbRoot = new File("db_root"); - if(!dbRoot.isDirectory()) { - return new String[0]; - } - - String[] dbs = dbRoot.list(new CouchDbDirFilter()); - - return dbs; - } - - private static long getRevisionForDb(String db) throws Exception - { - - File dbFile = new File("Lucene/State/" + db); - if(!dbFile.exists()) { - return 0; - } - - - RandomAccessFile stateFile = new RandomAccessFile("Lucene/State/" + db, "r"); - String revision = stateFile.readLine(); -// System.out.println("rev: " + revision); - return (long)Integer.parseInt(revision); - } - - private static void createIndexDir(String db) - { - File indexDir = new File("Lucene/Index/" + db); - if(!indexDir.exists()) { - indexDir.mkdirs(); - System.out.println("Created Index Directory"); - } - - File stateDir = new File("Lucene/State"); - if(!stateDir.exists()) { - stateDir.mkdirs(); - System.out.println("Created State Directory"); - } - } - - private static XMLReader getParser(SAXCouchDocumentBuilder documentBuilder) throws Exception - { - SAXParserFactory factory = SAXParserFactory.newInstance(); - SAXParser saxParser = factory.newSAXParser(); - XMLReader parser = saxParser.getXMLReader(); - parser.setContentHandler(documentBuilder); - return parser; - } - - private static BufferedInputStream getUrlStream(String address) throws Exception - { - URL url = new URL(address); - InputStream inStream = url.openStream(); - return new BufferedInputStream(inStream); - } - - public static com.fourspaces.couchdb.ViewResults getChangedDocumentsSinceRevision(String db, int revision) throws Exception - { - //BufferedInputStream inBuffer = getUrlStream("http://localhost:5984/" + db + "/_all_docs_by_update_seq?startkey=" + revision); - - com.fourspaces.couchdb.ViewResults newDocs = s.getDatabase(db).getAllDocuments(revision); - - return newDocs; - //return CouchDocument[] - -/* CouchDocument[] returnValue = {}; -*/ //setup xml parser -/* SAXCouchDocumentBuilder documentBuilder = new SAXCouchDocumentBuilder(); - XMLReader parser = getParser(documentBuilder); - // Repeat until end of file - parser.parse(new InputSource(inBuffer)); - - - return documentBuilder.getDocuments(); -*/ } - - - public static CouchDocument loadDocumentData(CouchDocument document, String db) throws Exception - { - BufferedInputStream inBuffer = getUrlStream("http://localhost:5984/" + db + "/" + document.getDocId() + "?rev=" + document.getRevision()); - - //setup xml parser - SAXCouchDocumentBuilder documentBuilder = new SAXCouchDocumentBuilder(); - XMLReader parser = getParser(documentBuilder); - - // Repeat until end of file - parser.parse(new InputSource(inBuffer)); - - return documentBuilder.getDocument(); - } -} diff --git a/src/fulltext/lucene/LuceneSearcher.java b/src/fulltext/lucene/LuceneSearcher.java deleted file mode 100644 index a5ccbe89..00000000 --- a/src/fulltext/lucene/LuceneSearcher.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - -Licensed under the Apache License, Version 2.0 (the "License"); you may not use -this file except in compliance with the License. You may obtain a copy of the -License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software distributed -under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -CONDITIONS OF ANY KIND, either express or implied. See the License for the -specific language governing permissions and limitations under the License. - -*/ - -/* - -LuceneSearcher searches a lucene index. - -It is managed by the Apache CouchDB daemon. - -*/ - -//basics -import java.io.*; - -//lucene -import org.apache.lucene.index.Term; -import org.apache.lucene.index.IndexReader; - -import org.apache.lucene.document.Document; - -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Hits; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.search.Query; - -/* -protocol: -Queries will look like this: - -databasename\n -the full text query\n - -Then the java reader will read the lines and respond -by outputing each document result: -ok\n -docid1\n -score1\n -docid2\n -score2\n -docid3\n -score3\n -\n - -or: - -error\n -error_id\n -error message\n - -*/ -public class LuceneSearcher -{ - public static void main(String[] args) throws Exception - { - - BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); - - String db = ""; - String queryString = ""; - - while(((db = in.readLine()) != null) && ((queryString = in.readLine()) != null)) { - - IndexSearcher searcher = new IndexSearcher("Lucene/Index/" + db); - - Query query = new TermQuery(new Term("__couchdb_keywords", queryString)); - - Hits hits = searcher.search(query); - - System.out.println("ok"); - for(int i = 0; i < hits.length(); i++) { - Document d = hits.doc(i); - System.out.println(d.get("__couchdb_document_id")); - System.out.println(hits.score(i)); - } - System.out.println(); - } - } -} diff --git a/src/fulltext/lucene/readme.txt b/src/fulltext/lucene/readme.txt deleted file mode 100644 index c115534c..00000000 --- a/src/fulltext/lucene/readme.txt +++ /dev/null @@ -1,41 +0,0 @@ -This is still work in progress and has not been integrated into the build -process. Good luck though :) - -This document describes how to use the LuceneIndexer with Apache CouchDB. - -Requirements: -Apache CouchDB 0.6.4 or newer. -Java Development Kit (JDK) 1.5 -Lucene 2.0.0 or newer -couchdb4j (http://code.google.com/p/couchdb4j/) - - -If you don't already have it, -download lucene-core-2.0.0.jar from a mirror -A list of mirrors can be found at -http://www.apache.org/dyn/closer.cgi/lucene/java/ - -Add the following line to your couch.ini: -LuceneServer=/usr/bin/java -cp "./bin/:./lib/lucene-core.jar" LuceneIndexer=... - -Adjust the version number and the path to java, if needed. -If you have lucene installed already, remove the -'-cp "./bin/:./Lucene/lucene-core-2.0.0.jar"' part. - -Put lucene-core.jar and cocuhdb4j.jar into $CouchDbDir/lib - -Launch Apache CouchDB. - -The indexer will populate $CouchDbDir/Lucene/Index with an index for -all documents in all databases. -(indexes per database will be added soon). - -To see that the data is actually stored in there, -use luke from http://www.getopt.org/luke/ - -To use the actual index, you could use the PHP 5 Lucene Demo in the Zend Framework -(http://framework.zend.com) or any other Lucene implementation in your favourite -language. - -If you have any questions, please visit: -http://couchdb.com/CouchDB/CouchDBWeb.nsf/vDissByDate |