我把我blog的数据(中英文混合)导出, 作为数据来源, 来说明sphinx的使用.
准备数据源
导入数据:
- mysql -u root -p test < wp_posts.sql
配置Sphinx
配置data source:
# Data source "blog": reads the WordPress posts table from the local MySQL
# database `test` (the database the wp_posts.sql dump was imported into).
source blog
{
    type = mysql
    sql_host = localhost
    sql_user = root
    sql_pass = xxxx
    sql_db = test
    sql_port = 3306

    # Main fetch query. The first column (ID) is the document id; post_date is
    # converted to a UNIX timestamp so it can be declared as a timestamp
    # attribute below. post_content is the full-text field.
    sql_query = \
        SELECT ID, post_author, UNIX_TIMESTAMP(post_date) as date_added, post_content from wp_posts

    # Attributes stored alongside each document and returned with every match.
    sql_attr_uint = post_author
    sql_attr_timestamp = date_added

    # Per-document lookup; $id is replaced with the matched document id.
    sql_query_info = SELECT * FROM wp_posts where ID=$id
}
-
配置index:
-
-
# Index "blog": built from the `blog` data source defined in this config.
index blog
{
    source = blog
    path = /usr/local/sphinx/var/data/blog
    docinfo = extern

    # Chinese (UTF-8) word segmentation; charset_dictpath must point at the
    # directory that holds the segmentation dictionary.
    charset_type = zh_cn.utf-8
    charset_dictpath = /usr/local/sphinx/dict
}
注意: 必须通过 charset_dictpath 指定分词字典所在的目录.
索引
- ./bin/indexer --config ./etc/sphinx.conf blog
如果没有错误, 会看到:
- Coreseek Full Text Server 3.0(beta)
-
Copyright (c) 2006-2008 coreseek.com
-
using config file './etc/sphinx.conf'...
-
indexing index 'blog'...
-
collected 165 docs, 0.2 MB
-
sorted 0.0 Mhits, 100.0% done
-
total 165 docs, 164834 bytes
-
total 0.099 sec, 1670067.52 bytes/sec, 1671.75 docs/sec
测试搜索
- ./bin/search -c ./etc/sphinx.conf -i blog 苹果
输出:
- Coreseek Full Text Server 3.0(beta)
-
Copyright (c) 2006-2008 coreseek.com
-
using config file './etc/sphinx.conf'...
-
0x815e1f8index 'blog': query '苹果 ': returned 1 matches of 1 total in 0.005 sec
-
-
displaying matches:
-
1. document=140, weight=1, post_author=1, date_added=Fri Nov 30 11:22:02 2007
-
ID=140
-
post_author=1
-
post_date=2007-11-30 11:22:02
-
post_date_gmt=2007-11-30 03:22:02
-
post_content=This is the text of the Commencement address by Steve Jobs, CEO of Apple Computer and of Pixar Animation Studios, delivered on June 12, 2005.
-
-
....
-
{中间的blog贴内容省略}
-
....
-
-
post_title=You've got to find what you love
-
post_category=0
-
post_excerpt=
-
post_status=publish
-
comment_status=open
-
ping_status=open
-
post_password=
-
post_name=youve-got-to-find-what-you-love
-
to_ping=
-
pinged=
-
post_modified=2008-06-20 11:58:53
-
post_modified_gmt=2008-06-20 03:58:53
-
post_content_filtered=
-
post_parent=0
-
guid=http://blog.funcat.cn/?p=164
-
menu_order=0
-
post_type=post
-
post_mime_type=
-
comment_count=0
- words:
-
1. '苹果': 1 documents, 10 hits
启动searchd
- ./bin/searchd
-
-
Coreseek Full Text Server 3.0(beta)
-
Copyright (c) 2006-2008 coreseek.com
-
using config file '/usr/local/sphinx/etc/sphinx.conf'...
使用
调用API:
#!/usr/bin/python
# -*- coding:utf-8 -*-
"""Minimal web.py front end that searches the Sphinx 'blog' index.

Python 2 script: it relies on ``reload(sys)`` / ``sys.setdefaultencoding``.
"""

import sys

# Python 2 only: force UTF-8 as the implicit str<->unicode codec so mixed
# Chinese/English text can be concatenated without UnicodeDecodeError.
# reload(sys) is needed because setdefaultencoding is removed from the
# module at interpreter start-up.
if sys.getdefaultencoding() != 'utf-8':
    reload(sys)
    sys.setdefaultencoding('utf-8')

import web
from web.contrib.template import render_mako

import MySQLdb
from MySQLdb import *

from sphinxapi import *

# URL routing table: pairs of (path regex, handler class name).
urls = (
    '/', 'index',
)

# Mako renderer; templates are looked up in ./templates.
render = render_mako(
    directories=['templates'],
    input_encoding='utf-8',
    output_encoding='utf-8',
)

app = web.application(urls, globals())

# Shared MySQL connection used to fetch the full post body for each match.
# NOTE(review): the dump was imported into the `test` database and the Sphinx
# source reads from `test`, but this connects to `blogdata` -- confirm which
# database actually holds wp_posts.
con = MySQLdb.Connect(host="localhost", port=3306, user="root", passwd="xixihaha", db="blogdata")
-
-
class index:
    """web.py handler for '/': shows the search form and runs Sphinx queries.

    Both methods render the ``index`` template with three strings:
    ``r_info`` (per-keyword statistics), ``e_info`` (error or warning text)
    and ``s_result`` (the formatted list of matches).
    """

    # Appended after every output line.
    # NOTE(review): the original literals were split across lines in the
    # published listing; presumably they contained an HTML line break that
    # was rendered away -- confirm the exact markup.
    LINE_BREAK = '<br />'

    def GET(self):
        """Render the search page with no results."""
        return render.index(r_info='', e_info='', s_result='')

    def POST(self):
        """Run the submitted keyword against the 'blog' index and render hits.

        Redirects back to '/' when the keyword is missing or empty. When the
        Sphinx query fails, only the error message is rendered.
        """
        import time  # kept function-local, as in the original listing

        # Default to '' so a request without the field redirects instead of
        # raising AttributeError.
        form = web.input(keyword='')
        query = form.keyword
        if query == '':
            raise web.seeother('/')

        cl = SphinxClient()
        cl.SetServer('localhost', 3312)
        res = cl.Query(query, 'blog')
        if not res:
            # Nothing to format: report the failure and stop here rather than
            # dereferencing the empty result below.
            e_info = 'query failed: %s' % cl.GetLastError()
            return render.index(r_info='', e_info=e_info, s_result='')

        e_info = ''
        if cl.GetLastWarning():
            e_info = 'WARNING: %s\n' % cl.GetLastWarning()

        # Per-keyword statistics.
        r_info = ''
        if 'words' in res:
            r_info = ''.join(
                '\t\'%s\' found %d times in %d documents%s'
                % (info['word'], info['hits'], info['docs'], self.LINE_BREAK)
                for info in res['words']
            )

        s_result = ''
        if 'matches' in res:
            parts = ['\nMatches:' + self.LINE_BREAK]
            # One cursor for all lookups, always released.
            cursor = con.cursor()
            try:
                for position, match in enumerate(res['matches']):
                    attrsdump = ''
                    for attr in res['attrs']:
                        attrname = attr[0]
                        attrtype = attr[1]
                        value = match['attrs'][attrname]
                        if attrtype == SPH_ATTR_TIMESTAMP:
                            value = time.strftime(
                                '%Y-%m-%d %H:%M:%S', time.localtime(value))
                        attrsdump = '%s, %s=%s' % (attrsdump, attrname, value)

                    parts.append(
                        '%d. doc_id=%s, weight=%d%s%s'
                        % (position + 1, match['id'], match['weight'],
                           attrsdump, self.LINE_BREAK)
                    )

                    # Parameterized query: the driver does the quoting.
                    cursor.execute(
                        'select post_content from wp_posts where id = %s',
                        (match['id'],))
                    row = cursor.fetchone()
                    # The index may be older than the table; skip documents
                    # whose row no longer exists instead of crashing.
                    if row is not None and row[0] is not None:
                        parts.append(row[0])
                    parts.append(self.LINE_BREAK)
            finally:
                cursor.close()
            s_result = ''.join(parts)

        return render.index(r_info=r_info, e_info=e_info, s_result=s_result)
-
-
# Start the web.py development server when the file is run as a script.
if __name__ == '__main__':
    app.run()
阅读(2637) | 评论(0) | 转发(0) |