ピリオドは通常、単語の区切り (ワード ブレーク) として扱われるため、全文検索では問題になります。解決策はピリオドを別の文字に置き換えることであり、アプリケーションへの変更は最小限で済みます。以下は、問題の特定から解決策にたどり着くまでを順を追って説明する、かなり長いスクリプトです。回避策だけが必要な場合は、「簡潔な答え」のセクションまで読み飛ばしてください。
フルテキスト スキーマのセットアップ
-- Session options in effect while the demo table and its indexes are created.
SET ANSI_NULLS ON;
SET QUOTED_IDENTIFIER ON;
SET ANSI_PADDING ON;

-- Demo table: an identity key plus the free text that will be full-text indexed.
CREATE TABLE [dbo].[FT_Test]
(
    [id]       [int] IDENTITY(1,1) NOT NULL,
    [TextData] [varchar](max)      NOT NULL,
    CONSTRAINT [PK_FT_Test] PRIMARY KEY CLUSTERED ([id] ASC) ON [PRIMARY]
) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY];
GO

-- Accent-sensitive catalog that will hold the full-text index.
CREATE FULLTEXT CATALOG [ft_default]
    WITH ACCENT_SENSITIVITY = ON;

-- Create the index on the primary key first, without any columns ...
CREATE FULLTEXT INDEX ON [dbo].[FT_Test]
    KEY INDEX [PK_FT_Test]
    ON ([ft_default])
    WITH (CHANGE_TRACKING AUTO);

-- ... then add the text column and enable the index.
ALTER FULLTEXT INDEX ON [dbo].[FT_Test] ADD ([TextData]);
ALTER FULLTEXT INDEX ON [dbo].[FT_Test] ENABLE;
SQL Server のバージョンを確認する
このスクリプトは SQL Server 2012 を前提に作成していますが、2008 にも当てはまるはずです。ワード ブレーカーは SQL Server 2008 と 2012 の間で大幅に変更されました (少なくとも言語 ID 1033 - 米国英語では)。主な違いは、1-2-3 が 1、2、3、nn1、nn2、nn3 に加えて、1-2-3 という語全体としても格納されるようになった点です (1-2-3 全体が含まれるのが新しい挙動です)。
GO
-- The printed message states that version 14.0.4763.1000 corresponds to SQL Server 2012;
-- compare it with what the procedure reports for the 'wordbreaker' component of language ID 1033.
PRINT 'Version 14.0.4763.1000 is Sql Server 2012';
EXECUTE master.sys.sp_help_fulltext_system_components
    @component_type = 'wordbreaker',
    @param = 1033;
Sql Server はキーワードをセミインテリジェントに解析します
残念ながら、今回はそれが不利に働きます。同じデータが複数回格納されてインデックスが肥大化し、検索結果も悪くなります。
GO
-- Start from an empty table so that only the sample row below is indexed
-- (intentionally unfiltered: the whole demo table is reset).
DELETE FROM dbo.FT_Test;

-- One row mixing dotted paragraph numbers, a decimal, a date and a formatted number.
INSERT INTO dbo.FT_Test (TextData)
VALUES ('1.1.1 5.2.1, 7.1.1.34.69; 12.11.10.9.8 4.6 7/13/2013 15,456.345');

-- Wait 5 seconds for the full-text index to populate.
WAITFOR DELAY '00:00:05';

-- List every keyword stored in the full-text index for each document.
SELECT t.*, kw.display_term, kw.occurrence_count
FROM sys.dm_fts_index_keywords_by_document(DB_ID(), OBJECT_ID('dbo.FT_Test')) AS kw
INNER JOIN dbo.FT_Test AS t
    ON kw.document_id = t.id
ORDER BY t.id, kw.keyword;
-- Notice what is returned: the two-digit numbers are identified, but the one-digit
-- numbers are not (due to the default stoplist).
-- Also note that they are treated as distinct items and are broken up. 4.6 does show
-- up because it is a decimal number.
-- The nn* display_terms are standardized numerics (also note how the date got
-- standardized as dd20130713 in addition to 7/13/2013).

-- No results: 5 and 2 are in the default stopword list.
SELECT *
FROM dbo.FT_Test
WHERE CONTAINS(*, '"5.2*"');

-- Periods are hard breaks, so this does not work either.
SELECT *
FROM dbo.FT_Test
WHERE CONTAINS(*, '"12.11*"');
1 桁の数字をインデックス化できるカスタム ストップリストを作成する
通常、全文検索において 1 桁の数字にはあまり価値がありませんが、今回は必要です。既定のシステム ストップリストをベースとして使用します。
-- Clone the system stoplist so that the single digits can be removed from the copy.
CREATE FULLTEXT STOPLIST [no_numbers]
    FROM SYSTEM STOPLIST
    AUTHORIZATION [dbo];
GO

-- Drop each digit 0-9 (English) from the copy so single digits are no longer stopwords.
ALTER FULLTEXT STOPLIST [no_numbers] DROP '0' LANGUAGE 'English';
ALTER FULLTEXT STOPLIST [no_numbers] DROP '1' LANGUAGE 'English';
ALTER FULLTEXT STOPLIST [no_numbers] DROP '2' LANGUAGE 'English';
ALTER FULLTEXT STOPLIST [no_numbers] DROP '3' LANGUAGE 'English';
ALTER FULLTEXT STOPLIST [no_numbers] DROP '4' LANGUAGE 'English';
ALTER FULLTEXT STOPLIST [no_numbers] DROP '5' LANGUAGE 'English';
ALTER FULLTEXT STOPLIST [no_numbers] DROP '6' LANGUAGE 'English';
ALTER FULLTEXT STOPLIST [no_numbers] DROP '7' LANGUAGE 'English';
ALTER FULLTEXT STOPLIST [no_numbers] DROP '8' LANGUAGE 'English';
ALTER FULLTEXT STOPLIST [no_numbers] DROP '9' LANGUAGE 'English';
GO
新しいストップリストに基づいてフルテキスト インデックスを再作成します
これである程度改善され、目標に近づきます。
-- Rebuild the full-text index so that it uses the custom [no_numbers] stoplist.
DROP FULLTEXT INDEX ON dbo.FT_Test;
CREATE FULLTEXT INDEX ON [dbo].[FT_Test] (TextData)
    KEY INDEX [PK_FT_Test]
    ON ([ft_default])
    WITH (CHANGE_TRACKING AUTO, STOPLIST = [no_numbers]);

-- Wait 5 seconds for the full-text index to populate.
WAITFOR DELAY '00:00:05';

-- List the keywords again, now under the new stoplist.
SELECT t.*, kw.display_term, kw.occurrence_count
FROM sys.dm_fts_index_keywords_by_document(DB_ID(), OBJECT_ID('dbo.FT_Test')) AS kw
INNER JOIN dbo.FT_Test AS t
    ON kw.document_id = t.id
ORDER BY t.id, kw.keyword;
-- Progress: single digits are showing up now.

-- More sample rows with dotted numbers of varying depth.
INSERT INTO dbo.FT_Test (TextData) VALUES ('1 1 14.123');
INSERT INTO dbo.FT_Test (TextData) VALUES ('5.2.1.1.14');
INSERT INTO dbo.FT_Test (TextData) VALUES ('1.1.3 ');
INSERT INTO dbo.FT_Test (TextData) VALUES ('2.2.3.3');
INSERT INTO dbo.FT_Test (TextData) VALUES ('6.0 88.00.00');

-- Change tracking populates the index asynchronously; wait as above so that the
-- new rows are searchable before the queries below run.
WAITFOR DELAY '00:00:05';

-- This works in the first 3 cases, but does not work for 2.2.
SELECT * FROM dbo.FT_Test AS t LEFT JOIN CONTAINSTABLE(dbo.FT_Test, *, '1.1.1*') AS ct ON ct.[key] = t.id;
SELECT * FROM dbo.FT_Test AS t LEFT JOIN CONTAINSTABLE(dbo.FT_Test, *, '2.2.3.3*') AS ct ON ct.[key] = t.id;
SELECT * FROM dbo.FT_Test AS t LEFT JOIN CONTAINSTABLE(dbo.FT_Test, *, '2.2.3*') AS ct ON ct.[key] = t.id;
SELECT * FROM dbo.FT_Test AS t LEFT JOIN CONTAINSTABLE(dbo.FT_Test, *, '2.2*') AS ct ON ct.[key] = t.id;

-- Double quoting makes it match more, but it is still broken.
SELECT * FROM dbo.FT_Test AS t LEFT JOIN CONTAINSTABLE(dbo.FT_Test, *, '"1.1.1*"') AS ct ON ct.[key] = t.id;
SELECT * FROM dbo.FT_Test AS t LEFT JOIN CONTAINSTABLE(dbo.FT_Test, *, '"1.1*"') AS ct ON ct.[key] = t.id;
確実に近づいてはいますが、上記の 2.2* のケースが厄介です。これは 10 進数として解析されてしまいます:
-- Look up the custom stoplist, then show how the full-text parser breaks up each
-- search string for language ID 1033 (quoted and unquoted, three and two segments).
-- The variable name is spelled identically everywhere so the batch also runs on a
-- case-sensitive instance collation.
DECLARE @stoplistId INT;
SET @stoplistId = (SELECT stoplist_id FROM sys.fulltext_stoplists WHERE name = 'no_numbers');

SELECT * FROM sys.dm_fts_parser('"1.1.1*"', 1033, @stoplistId, 0);
SELECT * FROM sys.dm_fts_parser('1.1.1*', 1033, @stoplistId, 0);
SELECT * FROM sys.dm_fts_parser('"1.1*"', 1033, @stoplistId, 0);
SELECT * FROM sys.dm_fts_parser('1.1*', 1033, @stoplistId, 0);
可能性のある区切り文字として、他にどのような文字がありますか?
いくつか試して、有望なものがないか見てみましょう。「XXXDOTXXX」のような文字列に置き換えることもできますが、可能であれば 1 文字で済ませる方がはるかにすっきりします。
-- One row that uses a different candidate separator character in each token.
INSERT INTO dbo.FT_Test (TextData)
VALUES ('1-1-1 2@2@2 3#3#3 4$4$4 5%5%5 6^6^6 7&7&7 8*8*8 9=9=9 10_10_10 11|11|11 12:12:12 12:12:12:12 13"13"13" 14~14~14 15`15`15');

-- Change tracking populates the index asynchronously; wait so the keywords of the
-- row just inserted are visible to the query below.
WAITFOR DELAY '00:00:05';

-- Show how each candidate token was stored in the index.
SELECT t.*, kw.display_term, kw.occurrence_count
FROM sys.dm_fts_index_keywords_by_document(DB_ID(), OBJECT_ID('dbo.FT_Test')) AS kw
INNER JOIN dbo.FT_Test AS t
    ON kw.document_id = t.id
WHERE t.TextData LIKE '1-1-1%'
ORDER BY t.id, kw.keyword;

-- Remove the probe row again.
DELETE FROM dbo.FT_Test
WHERE TextData LIKE '%3#3#3%';
ハイフン、アンダースコア、バッククォートは使えそうです。これらをさらに詳しく調べてみましょう。
-- Sample rows for the three remaining candidates: backquote, hyphen and underscore.
INSERT INTO dbo.FT_Test (TextData) VALUES ('3`3`3`4 1`2`3 6`1`2`3`4 ');
INSERT INTO dbo.FT_Test (TextData) VALUES ('5-5-5-6 2-3-4 6-1-2-3-4-5');
INSERT INTO dbo.FT_Test (TextData) VALUES ('6_6_6_7 3_4_5 7_1_2_3_4_5_6');

-- Change tracking populates the index asynchronously; wait before inspecting it.
WAITFOR DELAY '00:00:05';

SELECT t.*, kw.display_term, kw.occurrence_count
FROM sys.dm_fts_index_keywords_by_document(DB_ID(), OBJECT_ID('dbo.FT_Test')) AS kw
INNER JOIN dbo.FT_Test AS t
    ON kw.document_id = t.id
WHERE t.TextData LIKE '3`3%'
   OR t.TextData LIKE '5-5%'
   OR t.TextData LIKE '6[_]6%'  -- [_] matches a literal underscore; a bare _ is a one-character wildcard
ORDER BY t.id, kw.keyword;
-- Hyphen is not looking good now: it gets stored 3 times, as numbers, as individual
-- digits and as a full string.

-- Let's try backquote:
SELECT * FROM dbo.FT_Test AS t LEFT JOIN CONTAINSTABLE(dbo.FT_Test, *, '"3`3*"') AS ct ON ct.[key] = t.id;
SELECT * FROM dbo.FT_Test AS t LEFT JOIN CONTAINSTABLE(dbo.FT_Test, *, '"1`2`3*"') AS ct ON ct.[key] = t.id;
-- These match anything with a single 6... not good...
SELECT * FROM dbo.FT_Test AS t LEFT JOIN CONTAINSTABLE(dbo.FT_Test, *, '"6`*"') AS ct ON ct.[key] = t.id;
SELECT * FROM dbo.FT_Test AS t LEFT JOIN CONTAINSTABLE(dbo.FT_Test, *, '6`*') AS ct ON ct.[key] = t.id;
GO

-- The backquote is getting dropped when it is parsed.
-- (New batch: @stoplistId may only be declared once per batch.)
DECLARE @stoplistId INT;
SET @stoplistId = (SELECT stoplist_id FROM sys.fulltext_stoplists WHERE name = 'no_numbers');
SELECT * FROM sys.dm_fts_parser('"6`*"', 1033, @stoplistId, 0);
SELECT * FROM sys.dm_fts_parser('6`*', 1033, @stoplistId, 0);
GO

-- Underscore is just about all we have left.
-- Each prefix is probed in its quoted and its unquoted form.
DECLARE @stoplistId INT;
SET @stoplistId = (SELECT stoplist_id FROM sys.fulltext_stoplists WHERE name = 'no_numbers');
SELECT * FROM sys.dm_fts_parser('"2_*"', 1033, @stoplistId, 0);
SELECT * FROM sys.dm_fts_parser('2_*', 1033, @stoplistId, 0);
SELECT * FROM sys.dm_fts_parser('"2_2*"', 1033, @stoplistId, 0);
SELECT * FROM sys.dm_fts_parser('2_2*', 1033, @stoplistId, 0);
SELECT * FROM sys.dm_fts_parser('"2_2_*"', 1033, @stoplistId, 0);
SELECT * FROM sys.dm_fts_parser('2_2_*', 1033, @stoplistId, 0);

-- One more underscore row containing multi-digit segments.
INSERT INTO dbo.FT_Test (TextData) VALUES ('6_6_66_7 77_6_6_6');

-- Wait for the new row to be indexed before querying it.
WAITFOR DELAY '00:00:05';

-- Prefix searches against the rows that contain an underscore.
-- NOTE(review): the '6_6*' probe appears twice below, as in the original script.
SELECT * FROM dbo.FT_Test AS t LEFT JOIN CONTAINSTABLE(dbo.FT_Test, *, '"6_*"') AS ct ON ct.[key] = t.id WHERE t.TextData LIKE '%[_]%';
SELECT * FROM dbo.FT_Test AS t LEFT JOIN CONTAINSTABLE(dbo.FT_Test, *, '6_*') AS ct ON ct.[key] = t.id WHERE t.TextData LIKE '%[_]%';
SELECT * FROM dbo.FT_Test AS t LEFT JOIN CONTAINSTABLE(dbo.FT_Test, *, '6_6*') AS ct ON ct.[key] = t.id WHERE t.TextData LIKE '%[_]%';
SELECT * FROM dbo.FT_Test AS t LEFT JOIN CONTAINSTABLE(dbo.FT_Test, *, '2_3*') AS ct ON ct.[key] = t.id WHERE t.TextData LIKE '%[_]%';
SELECT * FROM dbo.FT_Test AS t LEFT JOIN CONTAINSTABLE(dbo.FT_Test, *, '6_6*') AS ct ON ct.[key] = t.id WHERE t.TextData LIKE '%[_]%';
SELECT * FROM dbo.FT_Test AS t LEFT JOIN CONTAINSTABLE(dbo.FT_Test, *, '6_6_6_7*') AS ct ON ct.[key] = t.id WHERE t.TextData LIKE '%[_]%';
SELECT * FROM dbo.FT_Test AS t LEFT JOIN CONTAINSTABLE(dbo.FT_Test, *, '"6_6_6_7*"') AS ct ON ct.[key] = t.id WHERE t.TextData LIKE '%[_]%';
SELECT * FROM dbo.FT_Test AS t LEFT JOIN CONTAINSTABLE(dbo.FT_Test, *, '"6_6_6_*"') AS ct ON ct.[key] = t.id WHERE t.TextData LIKE '%[_]%';
SELECT * FROM dbo.FT_Test AS t LEFT JOIN CONTAINSTABLE(dbo.FT_Test, *, '"6_6_6*"') AS ct ON ct.[key] = t.id WHERE t.TextData LIKE '%[_]%';
SELECT * FROM dbo.FT_Test AS t LEFT JOIN CONTAINSTABLE(dbo.FT_Test, *, '"6_6_*"') AS ct ON ct.[key] = t.id WHERE t.TextData LIKE '%[_]%';
簡潔な答え
ピリオドをアンダースコアに置き換えます
アンダースコアが最適です。アンダースコアは句読点ではなく文字として扱われます。SQL Server では計算列にフルテキスト インデックスを作成できるため、式を使ってデータを「補正」したうえでインデックス化し、ストレージを追加せずに (かつ最小限のオーバーヘッドで) クエリを実行できます。アプリケーション側は、「1.2.3」ではなく「1_2_3」で検索するように変更する必要があります。
-- Naive implementation: the original text followed by a copy with '.' replaced by '_'.
ALTER TABLE dbo.FT_Test ADD [TextData_FT1] AS ([TextData] + ' ' + REPLACE([TextData], '.', '_'));

-- Strip all letters (A-Z after upper-casing) and replace '.' with '_'.
-- You can customize this to pull out only the paragraph numbers.
-- NOTE(review): the outermost REPLACE swaps ' ' for ' ', which is a no-op as written;
-- it may originally have collapsed double spaces - confirm before changing it.
ALTER TABLE dbo.FT_Test ADD [TextData_FT2] AS (REPLACE(REPLACE(
REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(
REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(
REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(UPPER([TextData])
,'A', ' '),'B', ' '),'C', ' '),'D', ' '),'E', ' '),'F', ' '),
'G', ' '),'H', ' '),'I', ' '),'J', ' '),'K', ' '),'L', ' '),'M', ' '),'N', ' '),
'O', ' '),'P', ' '),'Q', ' '),'R', ' '),'S', ' '),'T', ' '),'U', ' '),'V', ' '),
'W', ' '),'X', ' '),'Y', ' '),'Z', ' '), '.','_') , ' ',' ')
);
GO
-- New batch: a column added by ALTER TABLE cannot be referenced by later statements
-- of the same batch, because the whole batch is compiled before the column exists.

-- Add the computed columns to the full-text index.
ALTER FULLTEXT INDEX ON [dbo].[FT_Test] ADD ([TextData_FT1]);
ALTER FULLTEXT INDEX ON [dbo].[FT_Test] ADD ([TextData_FT2]);
GO

-- Replace the experiment rows with a small document outline
-- (intentionally unfiltered: the whole demo table is reset).
DELETE FROM dbo.FT_Test;
INSERT INTO dbo.FT_Test (TextData) VALUES ('1 This is the chapter title');
INSERT INTO dbo.FT_Test (TextData) VALUES ('1.1 Section heading');
INSERT INTO dbo.FT_Test (TextData) VALUES ('1.1.1 paragraph 1 is very interesting');
INSERT INTO dbo.FT_Test (TextData) VALUES ('1.1.2 paragraph two is better');
INSERT INTO dbo.FT_Test (TextData) VALUES ('1.2 Another Section');
INSERT INTO dbo.FT_Test (TextData) VALUES ('1.2.1 Foobar qwerty loren ipsum');
INSERT INTO dbo.FT_Test (TextData) VALUES ('1.2.2 Foobar2 qwerty2 loren ipsum 12 items ');
INSERT INTO dbo.FT_Test (TextData) VALUES ('1.2.12 Foobar2 qwerty2 loren ipsum ');
INSERT INTO dbo.FT_Test (TextData) VALUES ('2.2.17 sql server is great. ');

-- Change tracking populates the index asynchronously; wait so the new rows and the
-- newly added columns are searchable before the queries below run.
WAITFOR DELAY '00:00:05';

-- Naive implementation
SELECT * FROM dbo.FT_Test WHERE CONTAINS(TextData_FT1, '"1_1*"');
SELECT * FROM dbo.FT_Test WHERE CONTAINS(TextData_FT1, '1*');
SELECT * FROM dbo.FT_Test WHERE CONTAINS(TextData_FT1, '2*');
SELECT * FROM dbo.FT_Test WHERE CONTAINS(TextData_FT1, '"1_1_2*"');
SELECT * FROM dbo.FT_Test WHERE CONTAINS(TextData_FT1, '1_1_2*');

-- Only index the paragraph identifiers
SELECT * FROM dbo.FT_Test WHERE CONTAINS(TextData_FT2, '"1_1*"');
SELECT * FROM dbo.FT_Test WHERE CONTAINS(TextData_FT2, '1*');
SELECT * FROM dbo.FT_Test WHERE CONTAINS(TextData_FT2, '2*');
SELECT * FROM dbo.FT_Test WHERE CONTAINS(TextData_FT2, '"1_1_2*"');
SELECT * FROM dbo.FT_Test WHERE CONTAINS(TextData_FT2, '1_1_2*');