* Stata Code: wordfreq
/* NOTES:
Should also check that each filename argument actually exists.
Support for the filename wildcard characters * and ? would be nice.
How can quote characters be eliminated?
Problems arise if a filename extension is something other than .txt
(e.g. .asc).
*/
* version 7.0
program define wordfreq, rclass
    /* wordfreq file1 [file2 ...]
       Builds a word-frequency table from one or more text files.
       Each file is read as a stream of whitespace-separated tokens (str30),
       cleaned by cleanwords, and collapsed to one row per distinct word.
       The result left in memory has the string variable word plus one count
       variable per input file, named "t" followed by the filename with any
       ".txt" removed.  Rows are sorted by descending total count.
       NOTE(review): the count-variable name is taken from the filename
       as given, so a path or characters that are illegal in variable names
       will still fail at the gen step - confirm callers pass bare names. */

    /* Without this guard the loop is skipped and the clean-up code runs
       against whatever dataset happens to be in memory. */
    if "`1'" == "" {
        display in red "wordfreq: no input files specified"
        exit 198
    }

    display in text " Starting WORDFREQ ..."

    /* Accumulate in a Stata-managed temporary file rather than a fixed
       wc_temp.dta in the working directory: a stale copy left by an earlier
       aborted run can no longer be merged into the result, and the file is
       removed automatically when the program ends. */
    tempfile accum
    /* Temporary name for the row total, so it cannot collide with a count
       variable (a file called ot.txt would otherwise produce "tot"). */
    tempvar tot
    local nfiles = 0

    while "`1'" ~= "" {
        /* Quote the filename so that names containing spaces work. */
        quietly infile str30 word using "`1'", automatic clear
        cleanwords word

        local countvar = subinstr("`1'",".txt","",.)
        local countvar = "t" + "`countvar'"
        quietly gen `countvar' = 1
        display in text " `1' --> `countvar'"
        quietly collapse (count) `countvar', by(word)

        quietly sort word
        if `nfiles' > 0 {
            /* Both sides are sorted by word, as the match-merge requires. */
            quietly merge word using "`accum'"
            quietly drop _merge
            quietly sort word
        }
        quietly save "`accum'", replace
        local nfiles = `nfiles' + 1
        macro shift
    }

    display in text " ...cleaning up..."
    /* Words absent from a file have a missing count after the merge. */
    quietly mvencode _all, mv(0)
    quietly egen `tot' = rsum(t*)
    quietly gsort -`tot'
    quietly drop `tot'
    display in text " Finished."
end
program define cleanwords, rclass
    /* cleanwords varname
       Normalises the string variable named by the first argument, in place:
       trims and lower-cases it, strips punctuation, drops rows that end up
       empty or consist of a single symbol, then splits multi-part entries
       with splitnc and stacks the pieces back into a single variable of the
       same name.  The number and order of observations change.
       NOTE(review): splitnc is not defined in this file; it is assumed to
       create <varname>1, <varname>2, ... from <varname> - confirm. */
    args word
    confirm variable `word'
    quietly replace `word' = trim(`word')
    quietly replace `word' = lower(`word')

    /* remove all of the following characters */
    quietly replace `word' = subinstr(`word',"(","",.)
    quietly replace `word' = subinstr(`word',")","",.)
    quietly replace `word' = subinstr(`word',";","",.)
    quietly replace `word' = subinstr(`word',":","",.)
    quietly replace `word' = subinstr(`word',",","",.)
    quietly replace `word' = subinstr(`word',"*","",.)
    quietly replace `word' = subinstr(`word',"^","",.)
    quietly replace `word' = subinstr(`word',"%","",.)
    quietly replace `word' = subinstr(`word',"{","",.)
    quietly replace `word' = subinstr(`word',"}","",.)
    quietly replace `word' = subinstr(`word',"[","",.)
    quietly replace `word' = subinstr(`word',"]","",.)
    quietly replace `word' = subinstr(`word',"....","",.)
    quietly replace `word' = subinstr(`word',"...","",.)
    quietly replace `word' = subinstr(`word',"..","",.)
    /* char(96) is the left single quote.  It is written with char() because
       a literal one inside a program body opens a macro reference. */
    quietly replace `word' = subinstr(`word',char(96),"",.)
    quietly replace `word' = subinstr(`word',"&","",.)

    /* remove these characters only if at the end of a word.
       index() returns the first occurrence, so the character is removed only
       when it appears nowhere else in the word (u.s. keeps its periods). */
    quietly replace `word' = subinstr(`word',".","",.) if index(`word',".")==(length(`word'))
    quietly replace `word' = subinstr(`word',"?","",.) if index(`word',"?")==(length(`word'))
    quietly replace `word' = subinstr(`word',"!","",.) if index(`word',"!")==(length(`word'))

    /* remove one leading and one trailing apostrophe (char(39)), hyphen or
       underscore.  Only the edge character is cut, so interior ones survive
       (contractions, hyphenated words), and a trailing one is removed even
       when the same character also occurs earlier in the word. */
    quietly replace `word' = substr(`word',2,length(`word')) if substr(`word',1,1)==char(39)
    quietly replace `word' = substr(`word',1,length(`word')-1) if substr(`word',length(`word'),1)==char(39)
    quietly replace `word' = substr(`word',2,length(`word')) if substr(`word',1,1)=="-"
    quietly replace `word' = substr(`word',1,length(`word')-1) if substr(`word',length(`word'),1)=="-"
    quietly replace `word' = substr(`word',2,length(`word')) if substr(`word',1,1)=="_"
    quietly replace `word' = substr(`word',1,length(`word')-1) if substr(`word',length(`word'),1)=="_"

    /* remove these characters if they stand alone */
    quietly drop if `word' == ""
    quietly drop if `word' == "%"
    quietly drop if `word' == "+"
    quietly drop if `word' == "-"
    quietly drop if `word' == "!"
    quietly drop if `word' == "/"
    quietly drop if `word' == "@"
    quietly drop if `word' == "~"
    quietly drop if `word' == "&"

    quietly sort `word'
    quietly splitnc `word'
    quietly drop `word'

    /* If the split produced more than one piece, stack all pieces into one
       long variable; otherwise just restore the original name. */
    capture confirm variable `word'2
    if _rc==0 {
        quietly stack `word'*, into(`word') clear
        quietly drop if `word'==""
        quietly drop _stack
    }
    else quietly rename `word'1 `word'
end
/****
program define cleanwords, rclass;
args wrd;
display "in cleanwords";
confirm variable `wrd';
quietly replace `wrd' = trim(`wrd');
quietly replace `wrd' = lower(`wrd');
quietly /* remove all of the following characters */
quietly replace `wrd' = subinstr(`wrd',"(","",.);
quietly replace `wrd' = subinstr(`wrd',")","",.);
quietly replace `wrd' = subinstr(`wrd',";","",.);
quietly replace `wrd' = subinstr(`wrd',":","",.);
/* replace `wrd' = subinstr(`wrd',"'","",.); */
quietly replace `wrd' = subinstr(`wrd',",","",.);
quietly replace `wrd' = subinstr(`wrd',"*","",.);
quietly replace `wrd' = subinstr(`wrd',"^","",.);
quietly replace `wrd' = subinstr(`wrd',"%","",.);
quietly replace `wrd' = subinstr(`wrd',"{","",.);
quietly replace `wrd' = subinstr(`wrd',"}","",.);
quietly replace `wrd' = subinstr(`wrd',"[","",.);
quietly replace `wrd' = subinstr(`wrd',"]","",.);
quietly replace `wrd' = subinstr(`wrd',"....","",.);
quietly replace `wrd' = subinstr(`wrd',"...","",.);
quietly replace `wrd' = subinstr(`wrd',"..","",.);
quietly replace `wrd' = subinstr(`wrd',"`","",.);
quietly replace `wrd' = subinstr(`wrd',"&","",.);
/* remove these characters only if at the end of a word */
quietly replace `wrd' = subinstr(`wrd',".","",.) if index(`wrd',".")==(length(`wrd'));
quietly replace `wrd' = subinstr(`wrd',"?","",.) if index(`wrd',"?")==(length(`wrd'));
quietly replace `wrd' = subinstr(`wrd',"!","",.) if index(`wrd',"!")==(length(`wrd'));
quietly replace `wrd' = subinstr(`wrd',"'","",.)
if index(`wrd',"'")==1 | index(`wrd',"'")==(length(`wrd'));
quietly replace `wrd' = subinstr(`wrd',"`","",.) if index(`wrd',"'")==1;
quietly replace `wrd' = subinstr(`wrd',"-","",.)
if index(`wrd',"-")==1 | index(`wrd',"-")==(length(`wrd'));
quietly replace `wrd' = subinstr(`wrd',"_","",.)
if index(`wrd',"_")==1 | index(`wrd',"_")==(length(`wrd'));
/* remove these characters if they stand alone */
quietly drop if `wrd' == "";
quietly drop if `wrd' == "%";
quietly drop if `wrd' == "+";
quietly drop if `wrd' == "-";
quietly drop if `wrd' == "!";
quietly drop if `wrd' == "/";
quietly drop if `wrd' == "@";
quietly drop if `wrd' == "~";
quietly drop if `wrd' == "&";
quietly sort `wrd';
quietly splitnc `wrd';
quietly drop `wrd';
capture confirm variable `wrd'2;
if _rc==0 {
quietly stack `wrd'*, into(`wrd') clear;
quietly drop if `wrd'=="";
quietly drop _stack;
}; else
quietly rename `wrd'1 `wrd';
end;
***/