hadoop+lucene-hiyachen-ChinaUnix博客

架构师（云操作系统AI微服务）hiyachen.blog.chinaunix.net

首页　| 　博文目录　| 　关于我

hiyachen

博客访问： 7161029
博文数量： 704
博客积分： 10821
博客等级：上将
技术积分： 12052
用户组：普通用户
注册时间： 2005-12-02 10:41

个人简介

中科院云平台架构师，专注于数字化、智能化，技术方向：云、Linux内核、AI、MES/ERP/CRM/OA、物联网、传感器、大数据、ML、微服务。

文章分类

全部博文（704）

云平台云计算（129）

未分类（0）

openstack（10）

分布式文件系统（3）

网络虚拟化（11）

容器云（1）

KVM-Libvirt（1）
大数据及数据挖掘（48）

spark（6）

算法（3）

hadoop（3）

mongodb（5）
Python（6）

python2（2）

python3（0）
linux-unix（72）

glusterrfs（8）

IPC（0）

文件系统（2）

AIX（1）

unix（34）
Java（170）

安全策略（4）

osgi（0）

AjAx（7）
数据库（70）

postgresql（0）

sqlite（0）

Redis(内存数据库（2）

Mysql（7）

Oracle_DB2_Sqlse（56）
Web（34）
网络与运维（19）

监控（3）
软件及系统架构（70）

金融（0）

移动开发（0）

UML（2）
中间件（2）

CICS（0）

Tuxedo（0）
C/C++（6）
PHP（3）
Others（19）

VBA||C#（3）

regExp（4）

Shell_Dos（5）
BPM（6）

JBPM（2）

工作流（4）
程序与人生（29）

Think（5）

Japan（3）
未分配的博文（21）

推荐博文

相关博文

hadoop+lucene

分类： HADOOP

2016-04-23 13:24:26

Hadoop最初是Lucene的子项目，现在发展如火如荼，如何利用Hadoop的分布式处理能力，来给Lucene提高建索引的效率呢，如此一来，便能充分利用HDFS的所有优点，但众所周知，HDFS系统，对随机读支持得并不友好，而像Lucene这种全文检索的框架，几乎所有的检索操作，都离不开随机读写的操作，那么如何才能使Lucene结合hadoop完美的工作呢，其实hadoop的版本里，在一个contrib的工具包里面，带了Lucene索引的工具类，不过貌似是用的人很少，散仙没有用过这个，在这里就不多评价了。

在solr4.4之后的项目，里面已经集成了向HDFS写入索引的jar包，如果你是在solr里面，那么很容易就能够，把索引建在HDFS上，只需要在solrconfig.xml里面配置Directory的实现类为HDFSDirectory即可，但是solr4.4里面的jar仅仅支持，最新版的hadoop，也就是2.0之后的，直接在1.x的hadoop里使用，会出现异常，这是由于，2.x和1.x的hadoop的API变化，散仙改了部分源码后，可以支持对1.x的hadoop进行索引、查询操作，在文末，散仙会把这几个类，给上传上来，用时，只需把这几个类导入工程即可。

下面看下散仙的测试demo的源码：

package  indexhadoop; 



import hdfs.HdfsDirectory; 



import org.apache.hadoop.conf.Configuration; 

import org.apache.hadoop.fs.Path; 

import org.apache.lucene.analysis.Analyzer; 

import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer; 

import org.apache.lucene.document.Document; 

import org.apache.lucene.document.Field.Store; 

import org.apache.lucene.document.StringField; 

import org.apache.lucene.document.TextField; 

import org.apache.lucene.index.DirectoryReader; 

import org.apache.lucene.index.IndexReader; 

import org.apache.lucene.index.IndexWriter; 

import org.apache.lucene.index.IndexWriterConfig; 

import org.apache.lucene.index.Term; 

import org.apache.lucene.queryparser.classic.QueryParser; 

import org.apache.lucene.search.IndexSearcher; 

import org.apache.lucene.search.Query; 

import org.apache.lucene.search.ScoreDoc; 

import org.apache.lucene.search.TopDocs; 

import org.apache.lucene.store.Directory; 

import org.apache.lucene.util.Version; 





/** 

* 

* @author qindongliang 

* 将索引存储在HDFS上的demo 

* 支持hadoop1.x的版本 

* 

* **/ 

public class MyIndex { 





public static void main(String[] args)throws Exception { 

//long a=System.currentTimeMillis(); 

  //add(); 

// long b=System.currentTimeMillis(); 

// System.out.println("耗时: "+(b-a)+"毫秒"); 

     query("中国"); 

//delete("3");//删除指定ID的数据 

} 







/*** 

* 得到HDFS的writer 

* 

* **/ 

public static IndexWriter  getIndexWriter() throws Exception{ 



Analyzer  analyzer=new SmartChineseAnalyzer(Version.LUCENE_46); 

IndexWriterConfig    config=new IndexWriterConfig(Version.LUCENE_46, analyzer); 

Configuration conf=new Configuration(); 

//Path p1 =new Path("hdfs://10.2.143.5:9090/root/myfile/my.txt"); 

//Path path=new Path("hdfs://10.2.143.5:9090/root/myfile"); 

Path path=new Path("hdfs://192.168.75.130:9000/root/index"); 

HdfsDirectory directory=new HdfsDirectory(path, conf); 

IndexWriter writer=new IndexWriter(directory, config); 



return writer; 



} 



/** 

* 建索引的方法 

* 

* **/ 

public static void add()throws Exception{ 

      

IndexWriter writer=getIndexWriter(); 





//	doc.add(new StringField("id", "3", Store.YES)); 

//	doc.add(new StringField("name", "lucene是一款非常优秀的全文检索框架", Store.YES)); 

//	doc.add(new TextField("content", "今天发工资了吗", Store.YES)); 

//	Document doc2=new Document(); 

//	doc.add(new StringField("id", "4", Store.YES)); 

//	doc2.add(new StringField("name", "今天天气不错呀", Store.YES)); 

//	doc2.add(new TextField("content", "钱存储在银行靠谱吗", Store.YES)); 

//	Document doc3=new Document(); 

//	doc3.add(new StringField("id", "5", Store.YES)); 

//	doc3.add(new StringField("name", "没有根的野草，飘忽的命途！", Store.YES)); 

//	doc3.add(new TextField("content", "你工资多少呀！", Store.YES)); 

//	writer.addDocument(doc); 

//	writer.addDocument(doc2); 

//	writer.addDocument(doc3); 

for(int i=6;i<10000;i++){ 

Document doc=new Document(); 

doc.add(new StringField("id", i+"", Store.YES)); 

doc.add(new StringField("name", "lucene是一款非常优秀的全文检索框架"+i, Store.YES)); 

doc.add(new TextField("content", "今天发工资了吗"+i, Store.YES)); 

writer.addDocument(doc); 

if(i%1000==0){ 

writer.commit(); 

} 

} 

writer.forceMerge(1); 

writer.commit(); 

System.out.println("索引10000条数据添加成功!"); 

writer.close(); 

} 



/*** 

* 添加索引 

* 

* **/ 

public static void add(Document d)throws Exception{ 

      

IndexWriter writer=getIndexWriter(); 

writer.addDocument(d); 

writer.forceMerge(1); 

writer.commit(); 

System.out.println("索引10000条数据添加成功!"); 

writer.close(); 

} 



/** 

* 根据指定ID 

* 删除HDFS上的一些数据 

* 

* 

* **/ 

public static void delete(String id)throws Exception{ 





IndexWriter writer=getIndexWriter(); 

writer.deleteDocuments(new Term("id", id));//删除指定ID的数据 

writer.forceMerge(1);//清除已经删除的索引空间 

writer.commit();//提交变化 



System.out.println("id为"+id+"的数据已经删除成功........."); 





} 



/** 

* 检索的方法 

* 

* **/ 

public static void query(String queryTerm)throws Exception{ 

System.out.println("本次检索内容:  "+queryTerm); 

Configuration conf=new Configuration(); 

//Path p1 =new Path("hdfs://10.2.143.5:9090/root/myfile/my.txt"); 

//	Path path=new Path("hdfs://192.168.75.130:9000/root/index"); 

Path path=new Path("hdfs://192.168.75.130:9000/root/output/map1"); 

Directory directory=new HdfsDirectory(path, conf); 

IndexReader reader=DirectoryReader.open(directory); 

System.out.println("总数据量: "+reader.numDocs()); 

long a=System.currentTimeMillis(); 

IndexSearcher searcher=new IndexSearcher(reader); 

QueryParser parse=new QueryParser(Version.LUCENE_46, "city", new SmartChineseAnalyzer(Version.LUCENE_46)); 



Query query=parse.parse(queryTerm); 



TopDocs docs=searcher.search(query, 100); 



System.out.println("本次命中结果:   "+docs.totalHits+"  条" ); 

//	for(ScoreDoc sc:docs.scoreDocs){ 

// 

//	System.out.println("评分:  "+sc.score+"  id : "+searcher.doc(sc.doc).get("id")+"  name:   "+searcher.doc(sc.doc).get("name")+"   字段内容: "+searcher.doc(sc.doc).get("content")); 

// 

//	} 

long b=System.currentTimeMillis(); 

System.out.println("第一次耗时:"+(b-a)+" 毫秒"); 

System.out.println("============================================"); 

long c=System.currentTimeMillis(); 

   query=parse.parse(queryTerm); 



   docs=searcher.search(query, 100); 

System.out.println("本次命中结果:   "+docs.totalHits+"  条" ); 

//	for(ScoreDoc sc:docs.scoreDocs){ 

// 

//	System.out.println("评分:  "+sc.score+"  id : "+searcher.doc(sc.doc).get("id")+"  name:   "+searcher.doc(sc.doc).get("name")+"   字段内容: "+searcher.doc(sc.doc).get("content")); 

// 

//	} 

long d=System.currentTimeMillis(); 

System.out.println("第二次耗时:"+(d-c)+" 毫秒"); 



reader.close(); 

directory.close(); 



System.out.println("检索完毕..............."); 









} 









}

上面是散仙测试的例子，经测试，对HDFS上的lucene索引的增删改查都没问题，但有一点需要注意，lucene结合hadoop，确实能大大提升建索引的速度，但是在检索上却没有任何优势，虽然也可以检索，但是速度比较慢，目前的存储实现，是利用了block cache的缓存特性，能使得检索性能差强人意，但是数据量大的时候，检索性能非常糟糕，这一点到现在还没有任何比较好的解决方法，除非，以后给lucene，或solr，增加类似Hbase的数据结构，如此一来，检索上可能会好很多。

上面的代码能够将索引，写入1.x的hadoop中，后续，散仙会给出，在hadoop2.x中建索引的例子，以及如何使用MapReduce并行建索引。

阅读(4449) | 评论(0) | 转发(0) |

上一篇：spark和redis集群部署

下一篇：大数据之统计学：箱图（boxplot）

给主人留下些什么吧！~~

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6