2015-01-06 17:56:50| 分类: Postgres基础 | 标签: |举报 |字号大中小 订阅
[root@db1 soft_bak]# tar xvf scws-1.2.2.tar.bz2
[root@db1 soft_bak]# cd scws-1.2.2
[root@db1 scws-1.2.2]# ./configure
[root@db1 scws-1.2.2]# make install
[root@db1 scws-1.2.2]# ls /usr/local/include/scws/
charset.h crc32.h darray.h pool.h rule.h scws.h version.h xdb.h xdict.h xtree.h
备注:编译前需先配置环境变量。[root@db1 ~]# source /home/pg94/.bash_profile
[root@db1 scws-1.2.2]# SCWS_HOME=/usr/local make && make install
pg94@db1-> ll /opt/pgsql_9.4beta3/share/extension/zhparser*
-rw-r--r-- 1 root root 606 Jan 5 20:52 /opt/pgsql_9.4beta3/share/extension/zhparser--1.0.sql
-rw-r--r-- 1 root root 133 Jan 5 20:52 /opt/pgsql_9.4beta3/share/extension/zhparser.control
-rw-r--r-- 1 root root 471 Jan 5 20:52 /opt/pgsql_9.4beta3/share/extension/zhparser--unpackaged--1.0.sql
pg94@db1-> psql francs
psql (9.4beta3)
Type "help" for help.
francs=# create extension zhparser;
CREATE EXTENSION
--make test configuration using parser
francs=# CREATE TEXT SEARCH CONFIGURATION testzhcfg (PARSER = zhparser);
CREATE TEXT SEARCH CONFIGURATION
--add token mapping
francs=# ALTER TEXT SEARCH CONFIGURATION testzhcfg ADD MAPPING FOR n,v,a,i,e,l WITH simple;
ALTER TEXT SEARCH CONFIGURATION
francs=> select to_tsvector('testzhcfg','南京市长江大桥');
to_tsvector
-------------------------
'南京市':1 '长江大桥':2
(1 row)
francs=> select to_tsvector('testzhcfg','山坑休闲钓鱼山庄');
to_tsvector
-------------------------------------
'休闲':2 '山坑':1 '山庄':4 '钓鱼':3
(1 row)
francs=> SELECT to_tsvector('testzhcfg','“今年保障房新开工数量虽然有所下调,但实际的年度在建规模以及竣工规模会超以往年份,相对应的对资金的需求也会创历>史纪录。”陈国强说。在他看来,与2011年相比,2012年的保障房建设在资金配套上的压力将更为严峻。');
to_tsvector
--------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------
---------------------------------
'2011':27 '2012':29 '上':35 '下调':7 '严峻':37 '会':14 '会创':20 '保障':1,30 '压力':36 '史':21 '国强':24 '在建':10 '实际':8 '对应':17 '年份':16 '
年度':9 '开工':4 '房':2 '房建':31 '数量':5 '新':3 '有所':6 '相比':28 '看来':26 '竣工':12 '纪录':22 '规模':11,13 '设在':32 '说':25 '资金':18,33 '超
':15 '配套':34 '陈':23 '需求':19
(1 row)
备注: 从上面几个例子来看,分词效果较好。
--测试 to_tsquery
francs=> SELECT to_tsquery('testzhcfg', '保障房资金压力');
to_tsquery
---------------------------------
'保障' & '房' & '资金' & '压力'
(1 row)
IBM BladeCenter HS22
8 核8G
CentOS release 6.5
PostgreSQL 9.4beta3
zhparser 1.0
francs=> select count(*) from tbl_store ;
count
---------
2994433
(1 row)
备注:两边带有 % 的 like 走全表扫,效率很低。francs=> explain analyze select name,type,city from tbl_store where name like '%美猴王批发%';
QUERY PLAN
-------------------------------------------------------------------------------------------------------------
Seq Scan on tbl_store (cost=0.00..106601.41 rows=282 width=44) (actual time=0.018..870.666 rows=2 loops=1)
Filter: ((name)::text ~~ '%美猴王批发%'::text)
Rows Removed by Filter: 2994431
Planning time: 0.964 ms
Execution time: 870.706 ms
(5 rows)
francs=> create index idx_gin_tbl_store_name on tbl_store using gin(to_tsvector('testzhcfg',name));
CREATE INDEX
Time: 48434.917 ms
备注: 查询时间 0.398 ms ,效率非常高。francs=> explain analyze select name,type,city from tbl_store where to_tsvector('testzhcfg',name) @@ to_tsquery('testzhcfg','美猴王批发');
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------------
Bitmap Heap Scan on tbl_store (cost=28.58..322.30 rows=75 width=44) (actual time=0.352..0.356 rows=2 loops=1)
Recheck Cond: (to_tsvector('testzhcfg'::regconfig, (name)::text) @@ '''美猴王'' & ''批发'''::tsquery)
Heap Blocks: exact=2
-> Bitmap Index Scan on idx_gin_tbl_store_name (cost=0.00..28.56 rows=75 width=0) (actual time=0.342..0.342 rows=2 loops=1)
Index Cond: (to_tsvector('testzhcfg'::regconfig, (name)::text) @@ '''美猴王'' & ''批发'''::tsquery)
Planning time: 0.168 ms
Execution time: 0.398 ms
(7 rows)
评论