Question : Count Times Words Appear

I am using MS SQL 2005.  I have data in a column that looks like:

Description
The product is green.   The
The Product is blue
Null
John is a good worker

The actual column has thousands of rows.  I would like to ask how I can write a query that counts the occurrences of each word. The query's end result would look like:

NewColumn    Count
The                  3
Product            2
is                      3
Green               1
Blue                  1
John                 1
a                       1
Good                1
Worker             1

The new columns count the number of times each word appears.  The words are separated by any of the following: a single space; a double space; a period; a period followed by a space; a period followed by a double space.

Thanks!

Answer : Count Times Words Appear

Assuming you use the regular expression template above, compiled into a DLL:


This will do some sampling for you.  If you have any questions about any of it, let me know.  I'm sure you will :)

It creates a database called tinker
switches to that DB
creates util_nums (populates)
enables CLR
adds the assembly and function
adds a delimited-string-to-table function I wrote that uses util_nums

Creates a WORDS table -- this you will need or something like it
Creates a YourTable table --replace YourTable with your actual table name throughout
Populates some sample data
Creates a Word_YourTableMatches table -- this you will need or something like it.  It holds matches of words to your ID field from your table.
Creates a procedure to process an ID and its varchar field.
loops through the sample data calling up_ProcessWords
Displays data in a couple of different formats.
1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14:
15:
16:
17:
18:
19:
20:
21:
22:
23:
24:
25:
26:
27:
28:
29:
30:
31:
32:
33:
34:
35:
36:
37:
38:
39:
40:
41:
42:
43:
44:
45:
46:
47:
48:
49:
50:
51:
52:
53:
54:
55:
56:
57:
58:
59:
60:
61:
62:
63:
64:
65:
66:
67:
68:
69:
70:
71:
72:
73:
74:
75:
76:
77:
78:
79:
80:
81:
82:
83:
84:
85:
86:
87:
88:
89:
90:
91:
92:
93:
94:
95:
96:
97:
98:
99:
100:
101:
102:
103:
104:
105:
106:
107:
108:
109:
110:
111:
112:
113:
114:
115:
116:
117:
118:
119:
120:
121:
122:
123:
124:
125:
126:
127:
128:
129:
130:
131:
132:
133:
134:
135:
136:
137:
138:
139:
140:
141:
142:
143:
144:
145:
146:
147:
148:
149:
150:
151:
152:
153:
154:
155:
156:
157:
158:
159:
160:
-- Session and database setup for the sample.
-- Row-count messages are suppressed for the rest of the session.
set nocount on
go
-- Create the sandbox database only when it is missing, so that re-running
-- the script does not fail here with "database already exists".
if db_id(N'tinker') is null
    create database tinker
go
use tinker
go
 
-- util_nums: helper table of consecutive integers, used by
-- fn_DelimitedToTable to walk a string one character position at a time.
CREATE TABLE util_nums
(
    n INT NOT NULL,
    CONSTRAINT PK_util_nums PRIMARY KEY CLUSTERED (n)
)
GO
-- Fill util_nums with 1..100000.
-- Each CTE cross-joins the previous one with itself, squaring the row count;
-- ROW_NUMBER() then numbers the rows and the filter keeps the first 100000.
WITH
    rows_2   AS (SELECT 1 AS c UNION ALL SELECT 1),                        -- 2
    rows_4   AS (SELECT 1 AS c FROM rows_2   AS l CROSS JOIN rows_2   AS r), -- 4
    rows_16  AS (SELECT 1 AS c FROM rows_4   AS l CROSS JOIN rows_4   AS r), -- 16
    rows_256 AS (SELECT 1 AS c FROM rows_16  AS l CROSS JOIN rows_16  AS r), -- 256
    rows_64k AS (SELECT 1 AS c FROM rows_256 AS l CROSS JOIN rows_256 AS r), -- 65,536
    rows_4g  AS (SELECT 1 AS c FROM rows_64k AS l CROSS JOIN rows_64k AS r), -- 4,294,967,296 (four billion)
    numbered AS (SELECT ROW_NUMBER() OVER (ORDER BY c) AS n FROM rows_4g)
INSERT INTO util_nums (n)
SELECT n
FROM numbered
WHERE n >= 1
  AND n <= 100000
GO
-- Turn on CLR integration so the assembly-backed function created later can run.
EXEC sp_configure 'clr enabled', 1
GO
-- Apply the pending configuration change.
RECONFIGURE WITH OVERRIDE
GO
 
-- Register the compiled .NET assembly.
-- Replace the '#PATH TO YOUR DLL#' placeholder with the folder that holds the DLL.
CREATE ASSEMBLY UserDefinedFunctions
FROM '#PATH TO YOUR DLL#\UserDefinedFunctions.dll'
WITH PERMISSION_SET = SAFE
GO
-- T-SQL entry point bound to the assembly's RegExReplace method.
-- NOTE(review): the method body lives in the DLL and is not visible here;
-- presumably it replaces every match of @SearchFor in @SearchIn with
-- @ReplaceWith -- confirm against the assembly source.
CREATE FUNCTION RegExReplace
(
    @SearchIn    NVARCHAR(MAX),
    @SearchFor   NVARCHAR(MAX),
    @ReplaceWith NVARCHAR(MAX)
)
RETURNS NVARCHAR(MAX)
AS EXTERNAL NAME UserDefinedFunctions.UserDefinedFunctions.RegExReplace
GO
-- Drop a previous version only when one exists; an unconditional DROP raises
-- an error on a fresh database.
if object_id(N'[dbo].[fn_DelimitedToTable]') is not null
    drop function [dbo].[fn_DelimitedToTable]
go
-- fn_DelimitedToTable: splits @DelimitedString on @Delimiter.
--
-- Parameters:
--   @DelimitedString  text to split
--   @Delimiter        separator, up to 32 characters
-- Returns one row per delimited value:
--   position  1-based offset, inside (@Delimiter + @DelimitedString + @Delimiter),
--             of the delimiter that precedes the value
--   theValue  text between that delimiter and the next one (may be empty)
-- Returns no rows when either argument is NULL or the delimiter is empty.
--
-- Relies on dbo.util_nums for character positions, so delimiters located
-- beyond the largest n stored in util_nums are not seen.
create function [dbo].[fn_DelimitedToTable](@DelimitedString nvarchar(max), @Delimiter nvarchar(32))
returns @Values TABLE
     (position      int not null  primary key clustered
     ,theValue      nvarchar(max)
     )
as
begin
    declare @delimiterLength int
    declare @padded          nvarchar(max)
    declare @paddedLength    bigint

    -- datalength()/2 counts every nvarchar character; len() ignores trailing
    -- spaces and would report 0 for a delimiter such as ' '.
    set @delimiterLength = datalength(@Delimiter) / 2

    -- An empty or NULL delimiter cannot split anything.
    if @delimiterLength is null or @delimiterLength = 0
        return

    -- Wrap the input in delimiters so that every value, including the first
    -- and the last, sits between two delimiters.
    set @padded       = @Delimiter + @DelimitedString + @Delimiter
    set @paddedLength = datalength(@padded) / 2

    insert into @Values (position, theValue)
        select
             n
            ,substring(@padded
                      ,n + @delimiterLength
                      ,charindex(@Delimiter, @padded, n + @delimiterLength) - n - @delimiterLength
                      ) as string_value
        from
            dbo.util_nums
        where
            -- stop before the closing delimiter: no value follows it
            n <= @paddedLength - @delimiterLength
            -- keep only positions where a delimiter starts
            and substring(@padded, n, @delimiterLength) = @Delimiter

    return
end
go
 
-- Words: one row per distinct word encountered.
-- WordHash is filled by up_ProcessWords with the hex MD5 of the word and is
-- the key used to look a word up.
CREATE TABLE Words
(
    WordId   INT IDENTITY NOT NULL,
    theWord  NVARCHAR(MAX) NOT NULL,
    WordHash VARCHAR(32),
    CONSTRAINT PK_Words PRIMARY KEY CLUSTERED (WordId)
)
GO
-- A given hash may be stored only once.
CREATE UNIQUE INDEX UIDX_Words_WordHash
    ON Words (WordHash)
GO
 
--Here's how you would process your data.
go
--This is SAMPLE DATA.
-- Stand-in for the real source table: an identity key plus the text column
-- whose words are to be counted.
CREATE TABLE YourTable
(
    Id               INT IDENTITY NOT NULL PRIMARY KEY CLUSTERED,
    YourVARCHARField NVARCHAR(MAX) NOT NULL
)

-- Four sample sentences.
INSERT INTO YourTable (YourVARCHARField)
SELECT 'I like to ride my bike in the park.'
UNION
SELECT 'I''d like to ride my bike in the park.'
UNION
SELECT 'Some people call a motorcycle a BIKE'
UNION
SELECT 'I like to call my bike a bike.'

GO
 
-- Word_YourTableMatches: one row per word occurrence.
--   ID        key of the source row in YourTable
--   WordID    key of the word in Words
--   position  position value reported by fn_DelimitedToTable for the occurrence
CREATE TABLE Word_YourTableMatches
(
    ID       INT NOT NULL,
    WordID   INT NOT NULL,
    position INT NOT NULL
)

GO
-- Allow the script to be re-run: drop an earlier version of the procedure.
if object_id(N'up_ProcessWords', N'P') is not null
    drop procedure up_ProcessWords
go
-- up_ProcessWords: records every word of one source row.
--
-- Parameters:
--   @ID         key of the source row; stored in Word_YourTableMatches.ID
--   @YourField  text of that row
--
-- Steps:
--   1. Split @YourField into words and hash each word (hex MD5).
--   2. Add words that are not yet in Words (once per hash).
--   3. Replace all rows of Word_YourTableMatches for @ID with the current words.
--
-- Hashing is performed on the text exactly as passed in, so callers that want
-- case-insensitive counting must normalise the case first.
create procedure up_ProcessWords (@ID int,@YourField nvarchar(max))
as

create table #Words
(position int
,theWord  nvarchar(max) not null
,WordHash varchar(32)   not null
,WordId   int           null
)
create clustered index tidx_Words on #Words(WordHash)

-- Every character that is not a letter, digit or apostrophe is turned into '_'
-- (assuming RegExReplace performs a regular-expression replace), doubled
-- underscores are collapsed once, and the text is split on '_'.
-- Empty values left over from longer runs of separators are discarded.
-- substring(...,3,32) strips the leading '0x' from the hex string.
insert into #Words(position,theWord,WordHash)
select position
      ,theValue
      ,substring(master.dbo.fn_varbintohexstr(hashbytes('MD5',theValue)),3,32)
from [dbo].[fn_DelimitedToTable](replace(dbo.RegExReplace(@YourField,'[^A-Za-z0-9'']','_'),'__','_'),'_')
where datalength(theValue)>0

-- Add unknown words to the dictionary.  A word can occur several times in the
-- same text, so only the first occurrence of each hash is inserted; inserting
-- all occurrences would violate UIDX_Words_WordHash.
insert into Words (theWord,WordHash)
select firstSeen.theWord
      ,firstSeen.WordHash
from (select theWord
            ,WordHash
            ,row_number() over (partition by WordHash order by position) as occurrence
      from #Words
     ) as firstSeen
where firstSeen.occurrence = 1
  and not exists (select * from Words wm where wm.WordHash = firstSeen.WordHash)

-- Every hash is now present in Words, so one pass resolves all ids.
update w
set WordId = wm.WordId
from #Words w
join Words wm
  on w.WordHash = wm.WordHash

--Clear all matches for that ID on process
delete from Word_YourTableMatches where ID = @ID

insert into Word_YourTableMatches(ID,WordID,position)
select @ID,WordId,position
from #Words

drop table #Words
go
 
 
--All of that, to get to here.  You run this while loop across your table and you are good to go.
 
-- Walk YourTable in ascending Id order and hand each row to up_ProcessWords.
-- The text is lower-cased first so that 'Bike' and 'bike' count as one word.
-- ORDER BY is required: TOP 1 without it may return any qualifying row, and
-- the "Id > @ID" step would then skip rows.
declare @ID int, @YourField nvarchar(max)

select top 1
       @ID        = Id
      ,@YourField = lower(YourVARCHARField)
from YourTable
order by Id

-- @ID stays NULL when the table is empty, so the loop is skipped.
while @ID is not null
begin

    exec up_ProcessWords @ID, @YourField

    -- fetch the next row after the one just processed
    select top 1
           @ID        = Id
          ,@YourField = lower(YourVARCHARField)
    from YourTable
    where Id > @ID
    order by Id

    -- no further row: the variables keep their old values, so stop here
    if @@rowcount = 0 break;
end

go
 
--shows where every word appears
select wt.ID
      ,wt.WordID
      ,wt.position
from Word_YourTableMatches wt
order by wt.ID, wt.position

--shows how many times each word appears
select w.theWord
      ,count(*) as WordCount
from Words w
inner join Word_YourTableMatches wt
   on w.WordId = wt.WordID
group by w.theWord
order by WordCount desc, w.theWord

--shows how many times each word appears, per ID
select w.theWord
      ,wt.ID
      ,count(*) as WordCount
from Words w
inner join Word_YourTableMatches wt
   on w.WordId = wt.WordID
group by w.theWord, wt.ID
order by WordCount desc, w.theWord, wt.ID

--the source rows, for comparison
select yt.Id
      ,yt.YourVARCHARField
from YourTable yt
order by yt.Id
Open in New Window Select All
Random Solutions  
 
programming4us programming4us