一些Lucene中的跨度查询的例子-0MXvABcUx0YPuDg-ChinaUnix博客

linux乐园37181

首页　| 　博文目录　| 　关于我

0MXvABcUx0YPuDg

博客访问： 524382
博文数量： 1496
博客积分： 79800
博客等级：大将
技术积分： 9940
用户组：普通用户
注册时间： 2008-09-09 13:22

文章分类

全部博文（1496）

未分配的博文（1496）

文章存档

2011年（1）

2008年（1495）

我的朋友

最近访客

推荐博文

一些Lucene中的跨度查询的例子

分类：

2008-09-09 13:30:13

SpanQuery（跨度查询）根据词在文档中的位置进行查询，可用于查找彼此相邻、或相互之间不超过指定距离的几个词。

SpanQuery包括以下几种：

SpanTermQuery：跨度查询的基础，检索结果和TermQuery相同，只不过额外记录了匹配词在文档中的位置信息，供其它跨度查询使用。

SpanFirstQuery：从字段内容的起始位置开始，只在指定的宽度范围内查找给定的跨度查询。

SpanNearQuery：要求几个子查询的匹配结果彼此之间保持在一定的距离（slop）之内。

SpanOrQuery：把多个跨度查询的结果合并起来，作为检索结果。

SpanNotQuery：从第一个跨度查询的结果中，去除与第二个跨度查询结果相重叠的部分。

示例代码如下：

view plaincopy to clipboardprint?
package com.lucene.search;

    //SpanQuery：跨度查询。此类为抽象类。
    import java.io.IOException;
    import java.io.StringReader;
    import java.util.ArrayList;
    import java.util.List;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.Field.Index;
    import org.apache.lucene.document.Field.Store;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.Hits;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.spans.SpanFirstQuery;
    import org.apache.lucene.search.spans.SpanNearQuery;
    import org.apache.lucene.search.spans.SpanNotQuery;
    import org.apache.lucene.search.spans.SpanOrQuery;
    import org.apache.lucene.search.spans.SpanQuery;
    import org.apache.lucene.search.spans.SpanTermQuery;
    import org.apache.lucene.search.spans.Spans;
    import org.apache.lucene.store.RAMDirectory;

/**
 * Small demonstration of Lucene span queries.
 *
 * <p>{@link #index()} builds a two-document in-memory index and prepares one
 * {@link SpanTermQuery} per interesting word. Each {@code *Test} method then
 * builds a span query and prints, for every matching span, the document text
 * with the matched span wrapped in angle brackets, followed by the document's
 * score in parentheses.
 *
 * <p>Call {@link #index()} before any of the test methods, and
 * {@link #close()} when done.
 */
public class SpanQueryTest {

    private RAMDirectory directory;
    private IndexSearcher indexSearcher;
    private IndexReader reader;
    private SpanTermQuery quick;
    private SpanTermQuery brown;
    private SpanTermQuery red;
    private SpanTermQuery fox;
    private SpanTermQuery lazy;
    private SpanTermQuery sleepy;
    private SpanTermQuery dog;
    private SpanTermQuery cat;
    private Analyzer analyzer;

    /**
     * Builds the in-memory index and initializes the term queries, the
     * searcher and the reader used by the other methods.
     *
     * @throws IOException if writing or opening the index fails
     */
    public void index() throws IOException {
        directory = new RAMDirectory();
        analyzer = new WhitespaceAnalyzer();

        IndexWriter writer = new IndexWriter(directory, analyzer, true);
        Document doc1 = new Document();
        doc1.add(new Field("field", "the quick brown fox jumps over the lazy dog", Store.YES, Index.TOKENIZED));
        Document doc2 = new Document();
        doc2.add(new Field("field", "the quick red fox jumps over the sleepy cat", Store.YES, Index.TOKENIZED));
        writer.addDocument(doc1);
        writer.addDocument(doc2);
        writer.optimize();
        writer.close();

        quick = new SpanTermQuery(new Term("field", "quick"));
        brown = new SpanTermQuery(new Term("field", "brown"));
        red = new SpanTermQuery(new Term("field", "red"));
        fox = new SpanTermQuery(new Term("field", "fox"));
        lazy = new SpanTermQuery(new Term("field", "lazy"));
        sleepy = new SpanTermQuery(new Term("field", "sleepy"));
        dog = new SpanTermQuery(new Term("field", "dog"));
        cat = new SpanTermQuery(new Term("field", "cat"));

        indexSearcher = new IndexSearcher(directory);
        reader = IndexReader.open(directory);
    }

    /**
     * Releases the searcher and the reader opened by {@link #index()}.
     * Safe to call when {@link #index()} was never invoked.
     *
     * @throws IOException if closing either resource fails
     */
    public void close() throws IOException {
        try {
            if (indexSearcher != null) {
                indexSearcher.close();
            }
        } finally {
            if (reader != null) {
                reader.close();
            }
        }
    }

    /**
     * Runs the query and prints one line per matching span: the stored text of
     * the matching document with the span delimited by {@code <} and
     * {@code >}, followed by that document's score in parentheses.
     *
     * @param query the span query to execute
     * @throws IOException if searching or reading the index fails
     */
    private void dumpSpans(SpanQuery query) throws IOException {
        // A span query can be searched like any other query; the hits supply the scores.
        Hits hits = indexSearcher.search(query);

        // Score lookup indexed by document id. Sized from the reader rather than a
        // hard-coded 2 so the method keeps working if more documents are indexed.
        float[] scores = new float[reader.maxDoc()];
        for (int i = 0; i < hits.length(); i++) {
            scores[hits.id(i)] = hits.score(i);
        }

        // The spans carry the position information of every match.
        Spans spans = query.getSpans(reader);
        while (spans.next()) {
            int id = spans.doc();
            Document doc = reader.document(id);
            Token[] tokens = AnalyzerUtils.tokensFromAnalysis(analyzer, doc.get("field"));

            StringBuffer buffer = new StringBuffer();
            for (int i = 0; i < tokens.length; i++) {
                // Example: in "the quick brown fox ..." a match on "brown" (third word)
                // has spans.start() == 2 and spans.end() == 3, so "<" goes before
                // token 2 and ">" after the token just before index 3.
                if (i == spans.start()) {
                    buffer.append("<");
                }
                buffer.append(tokens[i].termText());
                if (i + 1 == spans.end()) {
                    buffer.append(">");
                }
                buffer.append(" ");
            }
            buffer.append("(" + scores[id] + ") ");
            System.out.println(buffer);
        }
    }

    /**
     * SpanTermQuery: finds the same documents as a TermQuery but also records
     * match positions, which makes it the building block of the other span queries.
     *
     * @throws IOException if searching the index fails
     */
    public void spanTermQueryTest() throws IOException {
        dumpSpans(brown);

        // Output:
        // the quick <brown> fox jumps over the lazy dog (0.22097087)
    }

    /**
     * SpanFirstQuery: matches the wrapped query only within a fixed width
     * measured from the start of the field.
     *
     * @throws IOException if searching the index fails
     */
    public void spanFirstQueryTest() throws IOException {
        // doc1: the quick brown fox jumps over the lazy dog
        // "brown" is the third word, so the width must be >= 3 for it to be found.
        SpanFirstQuery firstQuery = new SpanFirstQuery(brown, 3);
        dumpSpans(firstQuery);

        // Output:
        // the quick <brown> fox jumps over the lazy dog (0.22097087)
    }

    /**
     * SpanNearQuery: similar to PhraseQuery, but the clauses may themselves be
     * span queries, so near-queries can be nested.
     *
     * @throws IOException if searching the index fails
     */
    public void spanNearQueryTest() throws IOException {
        // doc1: the quick brown fox jumps over the lazy dog

        // Second argument: maximum slop allowed between the clauses.
        // Third argument: true requires the clauses to match in the given order.
        // NOTE(review): the original comment said slop must be >= 5 here, but
        // quick/brown/fox are adjacent and in order, so a smaller slop should
        // also match - verify.
        SpanNearQuery nearQuery = new SpanNearQuery(new SpanQuery[] { quick, brown, fox }, 5, true);
        dumpSpans(nearQuery);

        // Unordered match of quick, dog and brown: the span has to stretch from
        // "quick" to "dog", which leaves 5 positions not covered by the three
        // terms, so slop must be >= 5.
        nearQuery = new SpanNearQuery(new SpanQuery[] { quick, dog, brown }, 5, false);
        dumpSpans(nearQuery);

        // Output of the first dumpSpans:
        // the <quick brown fox> jumps over the lazy dog (0.34204215)
        // Output of the second dumpSpans:
        // the <quick brown fox jumps over the lazy dog> (0.27026406)
    }

    /**
     * SpanNotQuery: takes the matches of the first span query and removes those
     * that overlap a match of the second span query.
     *
     * @throws IOException if searching the index fails
     */
    public void spanNotQueryTest() throws IOException {
        // "quick" followed by "fox" with at most one position in between:
        // matches "quick brown fox" and "quick red fox".
        SpanNearQuery quick_fox = new SpanNearQuery(new SpanQuery[] { quick, fox }, 1, true);
        dumpSpans(quick_fox);

        // Drop the quick..fox spans that contain "red"; only "quick brown fox" remains.
        SpanNotQuery no_quick_red_fox = new SpanNotQuery(quick_fox, red);
        dumpSpans(no_quick_red_fox);

        // Output of the first dumpSpans:
        // the <quick brown fox> jumps over the lazy dog (0.18579213)
        // the <quick red fox> jumps over the sleepy cat (0.18579213)
        // Output of the second dumpSpans:
        // the <quick brown fox> jumps over the lazy dog (0.18579213)
    }

    /**
     * SpanOrQuery: combines the matches of all the given span queries.
     *
     * @throws IOException if searching the index fails
     */
    public void spanOrQueryTest() throws IOException {
        SpanNearQuery quick_fox = new SpanNearQuery(new SpanQuery[] { quick, fox }, 1, true);
        SpanNearQuery lazy_dog = new SpanNearQuery(new SpanQuery[] { lazy, dog }, 0, true);
        SpanNearQuery sleepy_cat = new SpanNearQuery(new SpanQuery[] { sleepy, cat }, 0, true);

        SpanNearQuery qf_near_ld = new SpanNearQuery(new SpanQuery[] { quick_fox, lazy_dog }, 3, true);
        dumpSpans(qf_near_ld);

        SpanNearQuery qf_near_sc = new SpanNearQuery(new SpanQuery[] { quick_fox, sleepy_cat }, 3, true);
        dumpSpans(qf_near_sc);

        SpanOrQuery or = new SpanOrQuery(new SpanQuery[] { qf_near_ld, qf_near_sc });
        dumpSpans(or);

        // Output of the first dumpSpans:
        // the <quick brown fox jumps over the lazy dog> (0.3321948)
        // Output of the second dumpSpans:
        // the <quick red fox jumps over the sleepy cat> (0.3321948)
        // Output of the third dumpSpans:
        // the <quick brown fox jumps over the lazy dog> (0.5405281)
        // the <quick red fox jumps over the sleepy cat> (0.5405281)
    }

    /**
     * Builds the index, runs the SpanOrQuery demonstration and releases the
     * index resources.
     *
     * @param args unused
     * @throws IOException if indexing or searching fails
     */
    public static void main(String[] args) throws IOException {
        SpanQueryTest test = new SpanQueryTest();
        try {
            test.index();
            test.spanOrQueryTest();
        } finally {
            test.close();
        }
    }
}

/**
 * Helper for running text through an analyzer and collecting the tokens.
 */
class AnalyzerUtils {

    /**
     * Analyzes {@code text} with {@code analyzer} and returns every token the
     * resulting stream produces, in stream order.
     *
     * @param analyzer the analyzer used to tokenize the text
     * @param text the text to analyze
     * @return the tokens produced by the analyzer; empty if the stream yields none
     * @throws IOException if reading from or closing the token stream fails
     */
    public static Token[] tokensFromAnalysis(Analyzer analyzer, String text) throws IOException {
        TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
        // Raw List kept on purpose: nothing in this file uses generics.
        List tokens = new ArrayList();
        try {
            // stream.next() signals the end of the stream by returning null.
            Token token;
            while ((token = stream.next()) != null) {
                tokens.add(token);
            }
        } finally {
            // Release the stream even if tokenization fails part-way.
            stream.close();
        }
        return (Token[]) tokens.toArray(new Token[0]);
    }
}

【责编:landy】

--------------------next---------------------

阅读(125) | 评论(0) | 转发(0) |

上一篇：用JNI实现一个高精度的Java计时器

下一篇：Lucene在多个索引上进行搜索

给主人留下些什么吧！~~

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6