Stata Code: phrasefreq
/* NOTES:
Usage: phrasefreq n text1.txt text2.txt ...
where n is the maximum phrase length in words: single words and every
phrase of 2 to n consecutive words are counted for each file
*/
* version 7.0
#delimit ;
program define phrasefreq, rclass;
local phrlen = `1';
display in text " Starting PHRASEFREQ (length=`phrlen')...";
macro shift;
while "`1'" ~= "" {;
quietly infile str30 phrase using `1', automatic clear;
cleanwords phrase;
stackphrases `phrlen' phrase;
local tempname = subinstr("`1'",".txt","",.);
local tempname = "t" + "`tempname'";
quietly gen `tempname' = 1;
display in text " `1' --> `tempname'";
quietly collapse (count) `tempname', by(phrase);
capture confirm file wc_temp.dta;
if _rc==0 {;
quietly sort phrase;
quietly merge phrase using wc_temp;
quietly drop _merge;
};
quietly sort phrase;
quietly save wc_temp, replace;
macro shift;
};
erase wc_temp.dta;
display in text " ...cleaning up...";
quietly mvencode _all, mv(0);
quietly egen tot = rsum(t*);
quietly gsort -tot;
quietly drop tot;
tempvar tupletemp;
quietly gen ntuple = 3 if index( substr(phrase,index(phrase,"_")+1,.), "_");
quietly gen tupletemp = 1 if index(phrase, "_");
quietly replace ntuple=1 if tupletemp~=1;
quietly recode ntuple .=2;
quietly drop tupletemp;
quietly display in text " Finished.";
end;
program define cleanwords, rclass;
args word;
confirm variable `word';
quietly replace `word' = trim(`word');
quietly replace `word' = lower(`word');
quietly /* remove all of the following characters */
quietly replace `word' = subinstr(`word',"(","",.);
quietly replace `word' = subinstr(`word',")","",.);
quietly replace `word' = subinstr(`word',";","",.);
quietly replace `word' = subinstr(`word',":","",.);
/* replace `word' = subinstr(`word',"'","",.); */
quietly replace `word' = subinstr(`word',",","",.);
quietly replace `word' = subinstr(`word',"*","",.);
quietly replace `word' = subinstr(`word',"^","",.);
quietly replace `word' = subinstr(`word',"%","",.);
quietly replace `word' = subinstr(`word',"{","",.);
quietly replace `word' = subinstr(`word',"}","",.);
quietly replace `word' = subinstr(`word',"[","",.);
quietly replace `word' = subinstr(`word',"]","",.);
quietly replace `word' = subinstr(`word',"....","",.);
quietly replace `word' = subinstr(`word',"...","",.);
quietly replace `word' = subinstr(`word',"..","",.);
quietly replace `word' = subinstr(`word',"`","",.);
/* remove these characters only if at the end of a word */
quietly replace `word' = subinstr(`word',".","",.) if index(`word',".")==(length(`word'));
quietly replace `word' = subinstr(`word',"?","",.) if index(`word',"?")==(length(`word'));
quietly replace `word' = subinstr(`word',"!","",.) if index(`word',"!")==(length(`word'));
quietly replace `word' = subinstr(`word',"'","",.) if index(`word',"'")==1;
quietly replace `word' = subinstr(`word',"'","",.) if index(`word',"'")==(length(`word'));
quietly replace `word' = subinstr(`word',"`","",.) if index(`word',"'")==1;
/* remove these characters if they stand alone */
quietly drop if `word' == "";
quietly drop if `word' == "%";
quietly drop if `word' == "+";
quietly drop if `word' == "-";
quietly drop if `word' == "!";
quietly drop if `word' == "/";
quietly drop if `word' == "@";
quietly drop if `word' == "~";
quietly drop if `word' == "&";
quietly splitnc `word';
quietly drop `word';
capture confirm variable `word'2;
if _rc==0 {;
quietly stack `word'*, into(`word') clear;
quietly drop if `word'=="";
quietly drop _stack;
}; else
quietly rename `word'1 `word';
end;
program define stackphrases, rclass;
args len word;
confirm variable `word';
* if `len' > 3 {
* display "Error: stackphrases currently only supports phrase length max=3.";
* };
local i = 2;
while `i' <= `len' {;
quietly gen str80 `word'`i' = `word'[_n] + "_" + `word'[_n+1];
local j = 3;
while `j' <= `i' {;
quietly replace `word'`i' = `word'`i' + "_" + `word'[_n+`j'-1];
local j = `j' + 1;
};
local todrop = -1*`i'+1;
quietly replace `word'`i' = "" in `todrop'/l;
local i = `i' + 1;
};
quietly compress `word'*;
quietly stack `word'*, into(`word') clear;
quietly drop _stack;
quietly replace `word' = subinstr(`word',"_","",.)
if index(`word',"'")==(length(`word')) | index(`word',"'")==1;
quietly drop if `word' == "";
end;