
capture program drop ldagibbs

program ldagibbs

	syntax varlist(max=1) [, Topics(integer 10)  Alpha(real 0.25) Beta(real 0.1)  SEed(integer 0) BURNin_iter(integer 500) SAMples(integer 10) SAMPLing_iter(integer 50) MIn_char(integer 0) NAme_new_var(string) STOPwords(string) NORMalize LIKElihood MAt_save PAth(string) ]
	
	display "*************************************"
	display "********* Parameters for LDA *********"
	display "*************************************"
	display "Number of Topics: " `topics',
	display "Prior Alpha: "`alpha' 
	display "Prior Beta: "`beta' 
	display "Number of Iterations: "`burnin_iter' 
	display "Number of Samples: "`samples' 
	display "Iterations between Samples: "`sampling_iter'
	display "Seed: "`seed' 
	display "Minimal Word Length: "`min_char'
	display "*************************************"
	display "                                     " 
	

	
	
	if `"`normalize'"' == `""' { 
		local norm = 0
	}
	else {
		local norm = 1
	}
	
	if `"`likelihood'"' == `""' { 
		local like = 0
	}
	else {
		local like = 1
	}
	
	if `"`mat_save'"' == `""' { 
		local mat_s = 0
	}
	else {
		local mat_s = 1
	}
	
	mata:ldagibbs_mata( "`varlist'", `topics', `alpha' , `beta' ,  `burnin_iter' , `samples' , `sampling_iter', `seed' ,  `min_char', "`name_new_var'" , "`stopwords'", `norm', `like', `mat_s', "`path'" )
	
end


capture mata mata drop ldagibbs_mata()
mata:
function ldagibbs_mata( string scalar words, real scalar T, real scalar ALPHA, real scalar BETA,  real scalar NN,  real scalar samples,  real scalar sampling_iter, real scalar seed , real scalar min_char , string scalar name_new_var , string scalar stopwords , real scalar normalize , real scalar likelihood  , real scalar mat_save,  string scalar path ) {
	/* 1) Import Text Strings from Stata and first tokenize them into matrix */
	/*    Afterwards the code loops over all documents and tokens in the matrix and creates a word list.*/
	/*    This word list contains first a number and the word associate with that number. */
	/*   Using this word list a vector containing the number for the word tokens and */
	/*   a vector containing the document assignments for the word tokens is created.	*/
	text=st_sdata(. , words) /*text strings of documents*/
   	word = J(0,1,.)
	doc = J(0,1,.)
	word_list = J(0,1,"")
	
	rseed(seed)	
	"*************************************"
	"******** Preparing Documents ********"
	"*************************************"
	"                                     "
	if (stopwords!="") {
		stopwords = tokens(strlower(stopwords))
		
		for (document=1 ; document<=rows(text) ; document++){
			if (mod(document,500) == 0){
				"Processing Document:" 
				document /*Display Current Document*/
			}
		
			text_split = tokens(strlower(text[document]))   /*cast the string into lowercase and tokenizes the strings into words*/
			for (token=1 ; token<=cols(text_split) ; token++){
			
				/* remove stopwords from text */					
				if (sum(strmatch(text_split[token] , stopwords)) == 0) {
				
					/* remove words shorter the min_char from text */				
					if (strlen(text_split[token]) >= min_char) {
						
						/* if first word token append to word list */
						if (length(word_list)==0) { 					
							word_list = word_list \   text_split[token]
						}			
						
						if (sum(word_list :== text_split[token]  ) > 0){ 
							/*check if token is already in word list*/
							/* if yes append word token and document token to word and document vector*/			
							word = word \ selectindex(word_list:== text_split[token]) /*number of Word Tokens only needed for loop over all elements*/
							doc = doc \ document /*document belonging to word token*/
						}			

						if (sum(word_list :== text_split[token]  ) == 0) {
								word_list = word_list \ text_split[ token] /* add word token to word list*/
								word = word \ selectindex(word_list:== text_split[token]) /*number of Word Tokens only needed for loop over all elements*/
								doc = doc \ document /*document belonging to word token*/
						}
						
					}
				}
			}
		}
	}
	
	if (stopwords=="") {
		for (document=1 ; document<=rows(text) ; document++){
			if (mod(document,500) == 0){
				"Processing Document:" 
				document /*Display Current Document*/
			}
		
			text_split = tokens(strlower(text[document]))   /*cast the string into lowercase and tokenizes the strings into words*/
			for (token=1 ; token<=cols(text_split) ; token++){
			
				/* remove words shorter the min_char from text */				
				if (strlen(text_split[token]) >= min_char) {
					
					/* if first word token append to word list */
					if (length(word_list)==0) { 					
						word_list = word_list \   text_split[token]
					}			
					
					if (sum(word_list :== text_split[token]  ) > 0){ 
						/*check if token is already in word list*/
						/* if yes append word token and document token to word and document vector*/			
						word = word \ selectindex(word_list:== text_split[token]) /*number of Word Tokens only needed for loop over all elements*/
						doc = doc \ document /*document belonging to word token*/
					}			

					if (sum(word_list :== text_split[token] ) == 0) {
							word_list = word_list \ text_split[ token] /* add word token to word list*/
							word = word \ selectindex(word_list:== text_split[token]) /*number of Word Tokens only needed for loop over all elements*/
							doc = doc \ document /*document belonging to word token*/
					}
					
				}
			}
		}
	}
	"*************************************"
	"**** Initializing Gibbs Sampler *****"
	"*************************************"
	"                                     "
	/* 2) Randomly Assign Words to topics and randomly sort word doc and topic assignment*/
	/*    so that the update order for the Gibbs Sampler is random*/
	topic_ass = runiformint(length(word), 1, 1, T)  /*random topic assignment*/
	order = runiform(length(word),1 ) /*random order of tokens*/
	
	word = sort((word ,order), 2)[.,1]
	doc = sort((doc ,order), 2)[.,1]
	topic_ass = sort((topic_ass ,order), 2)[.,1]


	/* 3) Initialize Variables for the Gibbs Sampler */	
	W=rows(word) /*number of Word Tokens only needed for loop over all elements*/
	n_voc = 0
	n_docs = 0
	n_voc=max(word) /*number of unique words in vocabulary*/
	n_docs=max(doc) /*number of documents in data*/	
	WBETA = (n_voc*BETA)  /*sum of betas*/
	wi=0
	di=0
	topic=0

	probs =J(T,1,0) 
	ztot=J(T,1,0) /*vector for total number of topic assignment*/
	wp=J(T*n_voc,1,0)  /* vector for assignments of vocabulary to topics*/
	dp=J(T*n_docs,1,0)  /* vector for assignments of documents to topics*/
	
	 for (i=1; i<=W; i++) {
			  wi = word[ i ]
			  di = doc[ i ]
			  topic = topic_ass[i]
			  wp[ (wi-1)*(T) + topic ] = wp[ (wi-1)*(T)  + topic ] + 1
			  dp[ (di-1)*(T) + topic ] = dp[ (di-1)*(T)  + topic ] + 1 
			  ztot[ topic ]= ztot[ topic ] + 1 
		}
		
	/*Calculate Initial Likelihood*/
	 if (likelihood==1) {  
		dp_mat = colshape(dp, T)
		dp_mat = dp_mat + J(rows(dp_mat), T, ALPHA)
		
		wp_mat = colshape(wp, T)
		wp_mat = wp_mat + J(rows(wp_mat), T, BETA)
		
		/* normalize dp */
		rowsum = J(1,T,rowsum(dp_mat))
		dp_mat = dp_mat :/ rowsum
		/* normalize wp */
		colsum = J(rows(wp_mat),1,colsum(wp_mat))
		wp_mat = wp_mat :/ colsum
		
		likeh = 0
		for (i = 1; i <= W; i++) {
			wi  = word[i] /*current word*/
			di  = doc[i] /*current document */
			
			prob_sum = 0
			for (t = 1; t <= T; t++) {
				prob_sum = prob_sum + dp_mat[di,t]*wp_mat[wi,t]
			}					
			likeh = likeh + ln(prob_sum)
			likeh_old = likeh

		}				  
	}
	
	/* 3)  Start of  the Gibbs Sampler*/
	"*************************************"
	"******* Running Gibbs Sampler *******"
	"*************************************"
	"                                     "
	  for (iter=1; iter<=NN; iter++) { 
		  if (mod(iter,50) == 0){
			  "Iteration:" 
			  iter /*Display Current Iteration*/

			  /*Calculate Current Likelihood*/
			  if (likelihood==1) {  
			  	dp_mat = colshape(dp, T)
				dp_mat = dp_mat + J(rows(dp_mat), T, ALPHA)
				
				wp_mat = colshape(wp, T)
				wp_mat = wp_mat + J(rows(wp_mat), T, BETA)
				
				/* normalize dp */
				rowsum = J(1,T,rowsum(dp_mat))
				dp_mat = dp_mat :/ rowsum
				/* normalize wp */
				colsum = J(rows(wp_mat),1,colsum(wp_mat))
				wp_mat = wp_mat :/ colsum
				
				likeh = 0
				for (i = 1; i <= W; i++) {
					wi  = word[i] /*current word*/
					di  = doc[i] /*current document */
					
					prob_sum = 0
					for (t = 1; t <= T; t++) {
						prob_sum = prob_sum + dp_mat[di,t]*wp_mat[wi,t]
					}					
					likeh = likeh + ln(prob_sum)

				}
				
				likeh_change = likeh-likeh_old
				likeh_old = likeh
				"Log-Likelihood (Change):"
				strofreal( round(likeh) ) + "("  + strofreal(round(likeh_change)) + ")"
				
			  }
			  
			  
		  }
		  for (i = 1; i <= W; i++) { /*Loop over all word tokens*/
			  wi  = word[i] /*current word*/
			  di  = doc[i] /*current document */
			  topic = topic_ass[i] /*current topic*/
			  
			  /*Variables to chose right row*/
			  wioffset = (wi-1)*T
			  dioffset = (di-1)*T  
			  
			  
			  /*Delete Topics assignment*/
			  ztot[topic]=ztot[topic]-1      
			  wp[wioffset+topic] = wp[wioffset+topic] - 1  
			  dp[dioffset+topic] = dp[dioffset+topic] - 1  
			  
			  /* calculate posterior probability for all topics for the current token*/
			  totprob = 0 		  
			  for (j = 1; j <= T; j++) { 
				  probs[j] = (wp[ wioffset+j ] + BETA)/( ztot[j]+ WBETA) * (  dp[ dioffset+ j ] +  ALPHA )
				  totprob =totprob +  probs[j] 
			  } 
			  
			  /* update topic for current token based on posterior*/
			  r = totprob *  runiform(1,1)
			  max = probs[1] 
			  topic = 1 
			  while (r>max) { 
				  topic=topic + 1; 
				  max = max + probs[topic]; 
			  }
			  
			  /* update assignment vectors */
			  topic_ass[i] = topic 
			  wp[wioffset+topic] = wp[wioffset+topic] + 1 
			  dp[dioffset+topic] = dp[dioffset+topic] + 1 
			  ztot[topic]=ztot[topic] + 1
			  
			  } 
		  } 
			
		dp_store = dp
		wp_store = wp
		  
		  /* draw additional samples with a distance of sampling_iter */
		  for (s=2 ; s<=samples; s++){
			"Sample:"
			s
			  for (iter=1; iter<=sampling_iter; iter++) { 
				  if (mod(iter,50) == 0){
					  "Iteration:" 
					  iter /*Display Current Iteration*/
					  }
				  for (i = 1; i <= W; i++) { /*Loop over all word tokens*/
					  wi  = word[i] /*current word*/
					  di  = doc[i] /*current document */
					  topic = topic_ass[i] /*current topic*/
					  
					  /*Variables to chose right row*/
					  wioffset = (wi-1)*T
					  dioffset = (di-1)*T  
					  
					  
					  /*Delete Topics assignment*/
					  ztot[topic]=ztot[topic]-1      
					  wp[wioffset+topic] = wp[wioffset+topic] - 1  
					  dp[dioffset+topic] = dp[dioffset+topic] - 1  
					  
					  /* calculate posterior probability for all topics for the current token*/
					  totprob = 0 		  
					  for (j = 1; j <= T; j++) { 
						  probs[j] = (wp[ wioffset+j ] + BETA)/( ztot[j]+ WBETA) * (  dp[ dioffset+ j ] +  ALPHA )
						  totprob =totprob +  probs[j] 
					  } 
					  
					  /* update topic for current token based on posterior*/
					  r = totprob *  runiform(1,1)
					  max = probs[1] 
					  topic = 1 
					  while (r>max) { 
						  topic=topic + 1; 
						  max = max + probs[topic]; 
					  }
					  
					  /* update assignment vectors */
					  topic_ass[i] = topic 
					  wp[wioffset+topic] = wp[wioffset+topic] + 1 
					  dp[dioffset+topic] = dp[dioffset+topic] + 1 
					  ztot[topic]=ztot[topic] + 1
					  
					  } 
				  } 
			dp_store = dp_store + dp
			wp_store = wp_store + wp
		  }

	dp_mat = colshape(dp_store, T)
	dp_mat = dp_mat + J(rows(dp_mat), T, ALPHA)*samples

	
	wp_mat = colshape(wp_store, T)
	wp_mat = wp_mat + J(rows(wp_mat), T, BETA)*samples
	
	if (normalize==1) {
	/* normalize dp */
		rowsum = J(1,T,rowsum(dp_mat))
		dp_mat = dp_mat :/ rowsum
		/* normalize wp */
		colsum = J(rows(wp_mat),1,colsum(wp_mat))
		wp_mat = wp_mat :/ colsum
	}	  
		  
	if (name_new_var=="") name_new_var="topic_prob"	  
	/*4) Store the Document Probabilities in Stata*/
	for (t=1; t<=T; t++){
		var = st_addvar("double", name_new_var + strofreal(t))  
		st_store(., name_new_var + strofreal(t) ,dp_mat[.,t])
	}
	/* take care of possible slash at the end of path */
	if (substr(path,-1)!="/" & substr(path,-1)!="\") path = path + "/"
	if (path=="/") path = ""
	/* stores word_prob_matrix */
	if (mat_save == 1) {
		file_out = fopen(path + "word_prob", "rw")
		fputmatrix( file_out, wp_mat)
		fputmatrix( file_out, word_list)
		fclose(file_out)
	}

}
end

