非常にうまく機能する T-SQL NGrams を作成しました。使用方法の例については、コメント セクションに注意してください。
CREATE FUNCTION dbo.nGrams8K
(
@string VARCHAR(8000),
@n TINYINT,
@pad BIT
)
/*
Created by: Alan Burstein
Created on: 3/10/2014
Updated on: 5/20/2014 changed the logic to use an "inline tally table"
9/10/2014 Added some more code examples in the comment section
9/30/2014 Added more code examples
10/27/2014 Small bug fix regarding padding
Use: Outputs a stream of tokens based on an input string.
Works just like mdq.nGrams; see http://msdn.microsoft.com/en-us/library/ff487027(v=sql.105).aspx.
n-gram defined:
In the fields of computational linguistics and probability,
an n-gram is a contiguous sequence of n items from a given
sequence of text or speech. The items can be phonemes, syllables,
letters, words or base pairs according to the application.
To better understand N-Grams see: http://en.wikipedia.org/wiki/N-gram
*/
RETURNS TABLE
WITH SCHEMABINDING
AS
RETURN
WITH
E1(n) AS (SELECT 1 FROM (VALUES (1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) t(n)),
E2(n) AS (SELECT 1 FROM E1 a CROSS JOIN E1 b),
iTally(n) AS
(
SELECT TOP (LEN(@string)+@n) ROW_NUMBER() OVER (ORDER BY (SELECT NULL))
FROM E2 a CROSS JOIN E2 b
),
NewString(NewString) AS
(
SELECT REPLICATE(CASE @pad WHEN 0 THEN '' ELSE ' ' END,@n-1)+@string+
REPLICATE(CASE @pad WHEN 0 THEN '' ELSE ' ' END,@n-1)
)
SELECT TOP ((@n)+LEN(@string))
n AS [sequence],
SUBSTRING(NewString,n,@n) AS token
FROM iTally
CROSS APPLY NewString
WHERE n < ((@n)+LEN(@string));
/*
------------------------------------------------------------
-- (1) Basic Use
-------------------------------------------------------------
;-- (A)basic "string to table":
SELECT [sequence], token
FROM dbo.nGrams8K('abcdefg',1,1);
-- (b) create "bi-grams" (pad bit off)
SELECT [sequence], token
FROM dbo.nGrams8K('abcdefg',2,0);
-- (c) create "tri-grams" (pad bit on)
SELECT [sequence], token
FROM dbo.nGrams8K('abcdefg',3,1);
-- (d) filter for only "tri-grams"
SELECT [sequence], token
FROM dbo.nGrams8K('abcdefg',3,1)
WHERE len(ltrim(token)) = 3;
-- note the query plan for each. The power is coming from an index
-- also note how many rows are produced: len(@string+(@n-1))
-- lastly, you can trim as needed when padding=1
------------------------------------------------------------
-- (2) With a variable
------------------------------------------------------------
-- note, in this example I am getting only the stuff that has three letters
DECLARE @string varchar(20) = 'abcdefg',
@tokenLen tinyint = 3;
SELECT [sequence], token
FROM dbo.nGrams8K('abcdefg',3,1)
WHERE len(ltrim(token)) = 3;
GO
------------------------------------------------------------
-- (3) An on-the-fly alphabet (this will come in handy in a moment)
------------------------------------------------------------
DECLARE @alphabet VARCHAR(26)='ABCDEFGHIJKLMNOPQRSTUVWXYZ';
SELECT [sequence], token
FROM dbo.nGrams8K(@alphabet,1,0);
GO
------------------------------------------------------------
-- (4) Character Count
------------------------------------------------------------
DECLARE @string VARCHAR(100)='The quick green fox jumps over the lazy dog and the lazy dog just laid there.',
@alphabet VARCHAR(26)='ABCDEFGHIJKLMNOPQRSTUVWXYZ';
SELECT a.token, COUNT(b.token) ttl
FROM dbo.nGrams8K(@alphabet,1,0) a
LEFT JOIN dbo.nGrams8K(@string,1,0) b ON a.token=b.token
GROUP BY a.token
ORDER BY a.token;
GO
------------------------------------------------------------
-- (5) Locate the start position of a search pattern
------------------------------------------------------------
;-- (A) note these queries:
DECLARE @string varchar(100)='THE QUICK Green FOX JUMPED OVER THE LAZY DOGS BACK';
-- (i)
SELECT * FROM dbo.nGrams8K(@string,1,0) a;
-- (ii) note this query:
SELECT * FROM dbo.nGrams8K(@string,1,0) a WHERE [token]=' ';
-- (B) and now the word count (@string included for presentation)
SELECT @string AS string,
count(*)+1 AS words
FROM dbo.nGrams8K(@string,1,0) a
WHERE [token]=' '
GO
------------------------------------------------------------
-- (6) search for the number of occurances of a word
------------------------------------------------------------
DECLARE @string VARCHAR(100)='The quick green fox jumps over the lazy dog and the lazy dog just laid there.',
@alphabet VARCHAR(26)='ABCDEFGHIJKLMNOPQRSTUVWXYZ',
@searchString VARCHAR(100)='The';
-- (5a) by location
SELECT sequence-(LEN(@searchstring)) AS location,
token AS searchString
FROM dbo.nGrams8K(@string,LEN(@searchstring+' ')+1,0) b
WHERE token=@searchString;
-- (2b) get total
SELECT @string AS string,
@searchString AS searchString,
COUNT(*) AS ttl
FROM dbo.nGrams8K(@string,LEN(@searchstring+' ')+1,0) b
WHERE token=@searchString;
------------------------------------------------------------
-- (7) Special SubstringBefore and SubstringAfter
------------------------------------------------------------
-- (7a) SubstringBeforeSSI (note: SSI = substringIndex)
ALTER FUNCTION dbo.SubstringBeforeSSI
(
@string varchar(1000),
@substring varchar(100),
@substring_index tinyint
)
RETURNS TABLE
WITH SCHEMABINDING
AS
RETURN
WITH get_pos AS
(
SELECT rn = row_number() over (order by sequence), substring_index = sequence
FROM dbo.nGrams8K(@string,len(@substring),1)
WHERE token=@substring
)
SELECT newstring = substring(@string,1,substring_index-len(@substring))
FROM get_pos
WHERE rn=@substring_index;
GO
DECLARE @string varchar(1000)='10.0.1600.22',
@searchPattern varchar(100)='.',
@substring_index tinyint = 3;
SELECT * FROM dbo.SubstringBeforeSSI(@string,@searchPattern,@substring_index);
GO
-- (7b) SubstringBeforeSSI (note: SSI = substringIndex)
ALTER FUNCTION dbo.SubstringAfterSSI
(
@string varchar(1000),
@substring varchar(100),
@substring_index tinyint
)
RETURNS TABLE
WITH SCHEMABINDING
AS
RETURN
WITH get_pos AS
(
SELECT rn = row_number() over (order by sequence), substring_index = sequence
FROM dbo.nGrams8K(@string,len(@substring),1)
WHERE token=@substring
)
SELECT newstring = substring(@string,substring_index+1,8000)
FROM get_pos
WHERE rn=@substring_index;
GO
DECLARE @string varchar(1000)='<notes id="1">blah, blah, blah</notes><notes id="2">More Notes</notes>',
@searchPattern varchar(100)='</notes>',
@substring_index tinyint = 1;
SELECT @string, *
FROM dbo.SubstringAfterSSI(@string,@searchPattern,@substring_index);
------------------------------------------------------------
-- (8) Strip non-numeric characters from a string
------------------------------------------------------------
-- (8a) create the function
ALTER FUNCTION StripNonNumeric_itvf(@OriginalText VARCHAR(8000))
RETURNS TABLE
--WITH SCHEMABINDING
AS
return
WITH ngrams AS
(
SELECT n = [sequence], c = token
FROM dbo.nGrams8K(@OriginalText,1,1)
),
clean_txt(CleanedText) AS
(
SELECT c+''
FROM ngrams
WHERE ascii(substring(@OriginalText,n,1)) BETWEEN 48 AND 57
FOR XML PATH('')
)
SELECT CleanedText
FROM clean_txt;
GO
-- (8b) use against a value or variable
SELECT CleanedText
FROM dbo.StripNonNumeric_itvf('value123');
-- (8c) use against a table
-- test harness:
IF OBJECT_ID('tempdb..#strings') IS NOT NULL DROP TABLE #strings;
WITH strings AS
(
SELECT TOP (100000) string = newid()
FROM sys.all_columns a CROSS JOIN sys.all_columns b
)
SELECT *
INTO #strings
FROM strings;
GO
-- query (returns 100K rows every 3 seconds on my pc):
SELECT CleanedText
FROM #strings
CROSS APPLY dbo.StripNonNumeric_itvf(string);
------------------------------------------------------------
-- (9) A couple complex String Algorithms
------------------------------------------------------------
-- (9a) hamming distance between two strings:
DECLARE @string1 varchar(8000) = 'xxxxyyyzzz',
@string2 varchar(8000) = 'xxxxyyzzzz';
SELECT string1 = @string1,
string2 = @string2,
hamming_distance = count(*)
FROM dbo.nGrams8K(@string1,1,0) s1
CROSS APPLY dbo.nGrams8K(@string2,1,0) s2
WHERE s1.sequence = s2.sequence
AND s1.token <> s2.token
GO
-- (9b) inner join between 2 strings
--(can be used to speed up other string metrics such as the longest common subsequence)
DECLARE @string1 varchar(100)='xxxx123yyyy456zzzz',
@string2 varchar(100)='xx789yy000zz';
WITH
s1(string1) AS
(
SELECT [token]+''
FROM dbo.nGrams8K(@string1,1,0)
WHERE charindex([token],@string2)<>0
ORDER BY [sequence]
FOR XML PATH('')
),
s2(string2) AS
(
SELECT [token]+''
FROM dbo.nGrams8K(@string2,1,0)
WHERE charindex([token],@string1)<>0
ORDER BY [sequence]
FOR XML PATH('')
)
SELECT string1, string2
FROM s1
CROSS APPLY s2;
------------------------------------------------------------
-- (10) Advanced Substring Metrics
------------------------------------------------------------
-- (10a) Identify common substrings and their location
DECLARE @string1 varchar(100) = 'xxx yyy zzz',
@string2 varchar(100) = 'xx yyy zz';
-- (i) review the two strings
SELECT str1 = @string1,
str2 = @string2;
-- (ii) the results
WITH
iTally AS
(
SELECT n
FROM dbo.tally t
WHERE n<= len(@string1)
),
distinct_tokens AS
(
SELECT ng1 = ng1.token, ng2 = ng2.token --= ltrim(ng1.token), ng2 = ltrim(ng2.token)
FROM itally
CROSS APPLY dbo.nGrams8K(@string1,n,1) ng1
CROSS APPLY dbo.nGrams8K(@string2,n,1) ng2
WHERE ng1.token=ng2.token
)
SELECT ss_txt = ng1,
ss_len = len(ng1),
str1_loc = charindex(ng1,@string1),
str2_loc = charindex(ng2,@string2)
FROM distinct_tokens
WHERE ng1<>'' AND charindex(ng1,@string1)+charindex(ng2,@string2)<>0
GROUP BY ng1, ng2
ORDER BY charindex(ng1,@string1), charindex(ng2,@string2), len(ng1);
-- (10b) Longest common substring function
-- (i) function
IF EXISTS
( SELECT * FROM INFORMATION_SCHEMA.ROUTINES
WHERE ROUTINE_SCHEMA='dbo' AND ROUTINE_NAME = 'lcss')
DROP FUNCTION dbo.lcss;
GO
CREATE FUNCTION dbo.lcss(@string1 varchar(100), @string2 varchar(100))
RETURNS TABLE
AS
RETURN
SELECT TOP (1) with ties token
FROM dbo.tally
CROSS APPLY dbo.nGrams8K(@string1,n,1)
WHERE n <= len(@string1)
AND charindex(token, @string2) > 0
ORDER BY len(token) DESC;
GO
-- (ii) example of use
DECLARE @string1 varchar(100) = '000xxxyyyzzz',
@string2 varchar(100) = '999xxyyyzaa';
SELECT string1 = @string1,
string2 = @string2,
token
FROM dbo.lcss(@string1, @string2);
*/
GO