-- Word-indexing demo script (T-SQL / SQL Server).
-- Creates a scratch database, a numbers table, a CLR regex helper, a delimiter
-- splitter, and a procedure that records which words occur in each row of a table.
-- Suppress the "n rows affected" messages for this session.
set nocount on
go
-- Scratch database that holds every object created by this script.
create database tinker
go
use tinker
go
-- Numbers table: one row per integer, clustered on the integer itself.
create table util_nums
(
    n int not null,
    constraint PK_util_nums primary key clustered (n)
)
go
-- Populate util_nums with 1..100000.
-- Each CTE cross-joins the previous one with itself, squaring the row count;
-- the final level is numbered and only the first 100000 numbers are kept.
with
    rows_2  as (select 1 as c union all select 1),                           -- 2
    rows_4  as (select 1 as c from rows_2 as l cross join rows_2 as r),      -- 4
    rows_16 as (select 1 as c from rows_4 as l cross join rows_4 as r),      -- 16
    rows_256 as (select 1 as c from rows_16 as l cross join rows_16 as r),   -- 256
    rows_64k as (select 1 as c from rows_256 as l cross join rows_256 as r), -- 65,536
    rows_4g as (select 1 as c from rows_64k as l cross join rows_64k as r),  -- 4,294,967,296 (four BILLION, not million)
    numbered as (select row_number() over (order by c) as n from rows_4g)
insert into util_nums (n)
select numbered.n
from numbered
where numbered.n >= 1
  and numbered.n <= 100000
go
-- Switch on the 'clr enabled' server option and apply it.
exec sp_configure 'clr enabled', 1
go
reconfigure with override
go
-- Register the compiled assembly.
-- The path below is a placeholder: substitute the directory that holds the DLL before running.
create assembly UserDefinedFunctions
from '#PATH TO YOUR DLL#\UserDefinedFunctions.dll'
with permission_set = safe
go
-- T-SQL entry point bound to the RegExReplace method of the assembly's UserDefinedFunctions class.
create function RegExReplace
(
    @SearchIn    nvarchar(max),
    @SearchFor   nvarchar(max),
    @ReplaceWith nvarchar(max)
)
returns nvarchar(max)
external name UserDefinedFunctions.UserDefinedFunctions.RegExReplace
GO
-- Remove any earlier version. Guarded so that a first run against a fresh
-- database does not raise "cannot drop the function ... because it does not exist".
if object_id(N'[dbo].[fn_DelimitedToTable]') is not null
    drop function [dbo].[fn_DelimitedToTable]
go
-- Splits @DelimitedString on @Delimiter and returns one row per piece.
--
-- Parameters:
--   @DelimitedString  text to split.
--   @Delimiter        separator (1 to 32 characters); may end in, or consist of, spaces.
-- Returns (@Values):
--   position  1-based character offset in @DelimitedString at which the piece starts.
--   theValue  the piece itself; empty pieces (adjacent delimiters) are returned as ''.
-- A NULL input or an empty/NULL delimiter yields no rows.
-- NOTE: candidate positions come from dbo.util_nums, so pieces that start beyond the
-- largest number stored there (100000 as populated by this script) are not returned.
create function [dbo].[fn_DelimitedToTable](@DelimitedString nvarchar(max), @Delimiter nvarchar(32))
returns @Values TABLE
(position int not null primary key clustered
,theValue nvarchar(max)
)
as
begin
    declare @delimLen  int
    declare @padded    nvarchar(max)
    declare @lastStart bigint

    -- LEN() ignores trailing blanks, so LEN(' ') is 0. Appending a sentinel
    -- character and subtracting 1 gives the true character count.
    set @delimLen = len(@Delimiter + N'x') - 1

    -- An empty delimiter would match at every position and make SUBSTRING fail
    -- with a negative length; treat it (and NULL) as "nothing to split".
    if @delimLen is null or @delimLen = 0
        return

    -- Wrap the input in delimiters so every piece is bounded on both sides.
    set @padded = @Delimiter + @DelimitedString + @Delimiter

    -- Highest position at which a piece-opening delimiter can start
    -- (everything except the closing delimiter appended above).
    set @lastStart = (len(@padded + N'x') - 1) - @delimLen

    insert into @Values (position, theValue)
    select
        nums.n
        -- The piece runs from just after this delimiter up to the next delimiter.
        ,substring(@padded
                  ,nums.n + @delimLen
                  ,charindex(@Delimiter, @padded, nums.n + @delimLen) - nums.n - @delimLen)
    from
        dbo.util_nums as nums
    where
        nums.n <= @lastStart
        and substring(@padded, nums.n, @delimLen) = @Delimiter
    return
end
go
-- Dictionary of words: each row pairs the word text with the hash string
-- that other code in this script uses to look the word up.
create table Words
(
    WordId   int identity not null,
    theWord  nvarchar(max) not null,
    WordHash varchar(32),
    constraint PK_Words primary key clustered (WordId)
)
GO
-- At most one row per hash value.
create unique nonclustered index UIDX_Words_WordHash
    on Words (WordHash)
GO
-- From here on: how you would process your own data.
go
-- SAMPLE DATA only; stands in for the real table to be indexed.
create table YourTable
(
    Id               int identity not null primary key clustered,
    YourVARCHARField nvarchar(max) not null
)

-- UNION (not UNION ALL) is used, so identical sentences would be collapsed and
-- the order in which rows receive their identity values is not guaranteed.
insert into YourTable (YourVARCHARField)
          select 'I like to ride my bike in the park.'
    union select 'I''d like to ride my bike in the park.'
    union select 'Some people call a motorcycle a BIKE'
    union select 'I like to call my bike a bike.'
go
-- One row per word occurrence: source row (ID), word (WordID) and where it was found (position).
create table Word_YourTableMatches
(
    ID       int not null,
    WordID   int not null,
    position int not null
)
GO
-- Drop an earlier version, if any, so the script can be re-run.
if object_id(N'up_ProcessWords', N'P') is not null
    drop procedure up_ProcessWords
go
-- Breaks @YourField into words and records, for row @ID, which word occurs at
-- which position.
--
-- Parameters:
--   @ID         key of the source row; its existing matches are replaced.
--   @YourField  text to index. Case is not normalised here; the caller decides.
-- Effects:
--   * adds any words not yet present to Words (one row per WordHash);
--   * deletes all rows for @ID from Word_YourTableMatches, then inserts one
--     row per word occurrence.
create procedure up_ProcessWords (@ID int,@YourField nvarchar(max))
as
    create table #Words
    (
        position int,
        theWord  nvarchar(max) not null,
        wordHash varchar(32) not null,
        wordId   int null
    )
    create clustered index tidx_Words on #Words(WordHash)

    -- Tokenise: RegExReplace is asked to turn every character outside
    -- A-Z, a-z, 0-9 and apostrophe into '_', doubled '_' are collapsed once,
    -- and the result is split on '_'. Empty pieces are discarded.
    -- The hash is MD5 as a hex string; SUBSTRING(...,3,32) skips the leading
    -- two characters of fn_varbintohexstr's output (presumably the '0x' prefix).
    insert into #Words(position,theWord,WordHash)
    select
        position
        ,theValue
        ,substring(master.dbo.fn_varbintohexstr(hashbytes('MD5',theValue)),3,32)
    from [dbo].[fn_DelimitedToTable](replace(dbo.RegExReplace(@YourField,'[^A-Za-z0-9'']','_'),'__','_'),'_')
    where datalength(theValue)>0

    -- Add words not yet in the dictionary. DISTINCT matters: the same new word
    -- can occur several times in one input, and inserting it twice would violate
    -- the unique index on Words.WordHash.
    insert into Words (theWord,WordHash)
    select distinct
        w.theWord
        ,w.WordHash
    from #Words w
    left outer join Words wm
        on w.wordhash = wm.wordhash
    where wm.wordhash is null

    -- Every hash is now present in Words, so one pass resolves all ids.
    update w
    set wordId = wm.wordId
    from #Words w
    join Words wm
        on w.wordhash = wm.wordhash

    --Clear all matches for that ID on process
    delete from Word_YourTableMatches where ID = @ID

    insert into Word_YourTableMatches(ID,WordId,position)
    select @ID,WordID,position
    from #Words

    drop table #Words
go
-- Driver: run up_ProcessWords once for every row of YourTable, in ascending Id order.
-- The text is lower-cased first so that e.g. 'BIKE' and 'bike' hash to the same word.
declare @ID int, @YourField nvarchar(max)

-- First row. ORDER BY is required: TOP without it may return any row.
select top 1
    @ID = Id
    ,@YourField = lower(YourVARCHARField)
from YourTable
order by Id

while @ID is not null
begin
    exec up_ProcessWords @ID, @YourField

    -- Next row after the one just processed; ordering guarantees none is skipped.
    select top 1
        @ID = Id
        ,@YourField = lower(YourVARCHARField)
    from YourTable
    where Id > @ID
    order by Id

    -- No row assigned means the previous one was the last.
    if @@rowcount = 0 break
end
go
-- Every recorded word occurrence.
select *
from Word_YourTableMatches

-- How many times each word appears overall.
select
    wd.theWord
    ,count(*)
from Words as wd
inner join Word_YourTableMatches as m
    on wd.wordid = m.wordid
group by
    wd.theWord

-- How many times each word appears per source row, most frequent first.
select
    wd.theWord
    ,m.id
    ,count(*)
from Words as wd
inner join Word_YourTableMatches as m
    on wd.wordid = m.wordid
group by
    wd.theWord
    ,m.id
order by
    count(*) desc

-- The source rows, for comparison.
select *
from YourTable
-- End of script.