I am working with naive bayes multinomial model. I am supposed to use the pseudo code seen in the train method. So those are my questions:
1)I have put most code in but I have some problems mainly in extracting the vocabulary, counting the number of documents in the class, and concatenate the text of all docs in class.
2)I also noticed that the train method I need only requires the documents(aka train_doc). So I have no idea how to tweak to obtain C which is the class.
def train(self, documents):
# TRAINMULTINOMIALNB(C,D)
# 1 V <-- EXTRACTVOCABULARY(D)
# 2 N <-- COUNTDOCS(D)
# 3 for each c in C
# 4 do Nc <-- COUNTDOCSINCLASS(D, c)
# 5 prior[c] <-- Nc/N
# 6 textc <-- CONCATENATETEXTOFALLDOCSINCLASS(D, c)
# 7 for each t in V
# 8 do Tct <-- COUNTTOKENSOFTERM(textc, t)
# 9 for each t in V
# 10 do condprob[t][c] <-- Tct+1
# 11 return V, prior, condprob
"""
prior={}
N = len(documents)
#Vocab
V = Counter()
for d in documents:
V.update(doc[***])
#COUNTDOCSINCLASS(C,D)
cdic = Counter(C)
for d2 in documents:
for label in C:
cdic.update({label:int(math.ceil(float(doc[***])))})
#CONCATENATETEXTOFALLDOCSINCLASS(documents,C)
ctoadic = defaultdict(Counter)
for d3 in document:
for label2 in C:
if(float(***)>0):
ctoadic[label].update(doc[***])
#used to get term by class it is in
tii = defaultdict(Counter)
for label,word in ctoadic.iteritems():
for w in word:
tii[w].update({l:word[w]})
#getCondProb(tii,ctofadic,C)
gcp = defaultdict(lambda: defaultdict(float))
tnw ={} #total number of words in that label
for l,v inctofadic.iteritems():
tnwl[l] = sum(v.values())
for w,count in tii.iteritems():
#for 0 occurences
z = [zeroo for zeroo in C if zeroo not in count.keys()]
for ling in z:
gcp[w[ling]=1.0/(len(ctofadic[ling])+tnw[ling])
for ling,val in count.iteritems():
gcp[w][ling]=float(val+1)/(len(ctofadic[ling])+tnw[ling])
#Prior
for c in C:
prior[c] = cdic[c] / float(N)
return V,prior,gcp