from string import punctuation
from collections import Counter
from nltk import bigrams, trigrams, NaiveBayesClassifier, classify
from nltk.corpus import PlaintextCorpusReader, stopwords
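
# The Norwegian stopword list used below requires the NLTK data to be
# downloaded once beforehand, e.g. with nltk.download('stopwords').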
# STARTER CODE
def load_corpus(corpus_root):
    '''Takes the root directory of a corpus as a string.
    Returns an NLTK corpus.'''
    return PlaintextCorpusReader(corpus_root, r'.*\.txt')
# STARTER CODE
def clean_words(words):
    '''Takes a list of tokens. Returns the list, lowercased,
    without punctuation and stopwords.'''
    stopwords_nor = stopwords.words('norwegian')
    return [word.lower() for word in words
            if word not in punctuation and word.lower() not in stopwords_nor]
# STARTER CODE
def split_data(pos_feats, neg_feats):
    '''Takes lists of positive and negative feature sets, respectively.
    Returns the lists combined and split into train_set, dev_set and test_set.'''
    test_set = pos_feats[:122] + neg_feats[:122]
    dev_set = pos_feats[122:182] + neg_feats[122:182]
    train_set = pos_feats[182:] + neg_feats[182:]
    return train_set, dev_set, test_set
# TASK 4.2
def feature_extractor_top_1000(document):
    '''Returns a feature dict marking the 1000 most frequent
    words in the document.'''
    features = {}
    fd = Counter(document)
    frequent_words = [word for word, count in fd.most_common(1000)]
    for word in frequent_words:
        features[f'contains({word})'] = True
    return features
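# For illustration, the resulting feature dict looks like:
# feature_extractor_top_1000(['bra', 'bra', 'film'])
# -> {'contains(bra)': True, 'contains(film)': True}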
# TASK 4.3.1
def feature_extractor_bow(document):
    '''Bag of words: one feature per distinct word, after removing
    punctuation and stopwords.'''
    features = {}
    for word in clean_words(set(document)):
        features[f'contains({word})'] = True
    return features
# TASK 4.3.2
def feature_extractor_bow_bigrams(document):
    '''Bag of words plus one feature per distinct bigram.'''
    features = feature_extractor_bow(document)
    for bigram in set(bigrams(document)):
        features[f'contains({bigram})'] = True
    return features
# TASK 4.3.3
def feature_extractor_bow_bigrams_trigrams(document):
    '''Bag of words and bigrams plus one feature per distinct trigram.'''
    features = feature_extractor_bow_bigrams(document)
    for trigram in set(trigrams(document)):
        features[f'contains({trigram})'] = True
    return features
def main():
    # TASK 4.1
    reviews = load_corpus('NoReC/')
    pos_reviews = []
    neg_reviews = []
    # Sort the reviews by polarity based on the filename prefix.
    for file in reviews.fileids():
        words = [word.lower() for word in reviews.words(file)]
        if file.startswith('pos'):
            pos_reviews.append(words)
        elif file.startswith('neg'):
            neg_reviews.append(words)
    # TASK 4.2
    print('1000 MOST FREQUENT WORDS =========================')
    neg_features = [(feature_extractor_top_1000(review), 'neg') for review in neg_reviews]
    pos_features = [(feature_extractor_top_1000(review), 'pos') for review in pos_reviews]
    train_set_1000, dev_set_1000, test_set_1000 = split_data(pos_features, neg_features)
    print(f'len(train_set)={len(train_set_1000)} len(dev_set)={len(dev_set_1000)} len(test_set)={len(test_set_1000)}')
    classifier_1000 = NaiveBayesClassifier.train(train_set_1000)
    accuracy = classify.accuracy(classifier_1000, dev_set_1000)
    print('Accuracy on dev_set:', accuracy)
    classifier_1000.show_most_informative_features(30)
    # Examples of stopwords are "har", "henne" and "som". These are words that
    # tend to occur often in a text without contributing much meaning to the model.
    # By weeding out punctuation and stopwords we can achieve higher accuracy,
    # because we get fewer features with "neutral" sentiment.
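    # For illustration, assuming "har" and "en" are on NLTK's Norwegian
    # stopword list:
    # clean_words(['Filmen', 'har', 'en', 'god', 'historie', '.'])
    # -> ['filmen', 'god', 'historie']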
    # TASK 4.3.1
    print('\nBAG OF WORDS ===================================')
    neg_features = [(feature_extractor_bow(review), 'neg') for review in neg_reviews]
    pos_features = [(feature_extractor_bow(review), 'pos') for review in pos_reviews]
    train_set_bow, dev_set_bow, test_set_bow = split_data(pos_features, neg_features)
    print(f'len(train_set)={len(train_set_bow)} len(dev_set)={len(dev_set_bow)} len(test_set)={len(test_set_bow)}')
    classifier_bow = NaiveBayesClassifier.train(train_set_bow)
    accuracy = classify.accuracy(classifier_bow, dev_set_bow)
    print('Accuracy on dev_set:', accuracy)
    classifier_bow.show_most_informative_features(10)
    # TASK 4.3.2
    print('\nBAG OF WORDS + BIGRAMS =========================')
    neg_features = [(feature_extractor_bow_bigrams(review), 'neg') for review in neg_reviews]
    pos_features = [(feature_extractor_bow_bigrams(review), 'pos') for review in pos_reviews]
    train_set_bi, dev_set_bi, test_set_bi = split_data(pos_features, neg_features)
    print(f'len(train_set)={len(train_set_bi)} len(dev_set)={len(dev_set_bi)} len(test_set)={len(test_set_bi)}')
    classifier_bigram = NaiveBayesClassifier.train(train_set_bi)
    accuracy = classify.accuracy(classifier_bigram, dev_set_bi)
    print('Accuracy on dev_set:', accuracy)
    classifier_bigram.show_most_informative_features(10)
    # TASK 4.3.3
    print('\nBAG OF WORDS + BIGRAMS + TRIGRAMS ==============')
    neg_features = [(feature_extractor_bow_bigrams_trigrams(review), 'neg') for review in neg_reviews]
    pos_features = [(feature_extractor_bow_bigrams_trigrams(review), 'pos') for review in pos_reviews]
    train_set_tri, dev_set_tri, test_set_tri = split_data(pos_features, neg_features)
    print(f'len(train_set)={len(train_set_tri)} len(dev_set)={len(dev_set_tri)} len(test_set)={len(test_set_tri)}')
    classifier_trigram = NaiveBayesClassifier.train(train_set_tri)
    accuracy = classify.accuracy(classifier_trigram, dev_set_tri)
    print('Accuracy on dev_set:', accuracy)
    classifier_trigram.show_most_informative_features(10)
    # TASK 4.4
    print('\nThe model with BoW, bigrams and trigrams gives the highest accuracy on dev_set.')
    accuracy = classify.accuracy(classifier_trigram, test_set_tri)
    print('Accuracy on test_set:', accuracy)
    # Suggested improvements (see the sketch below):
    # - Handle negation, so that e.g. "ikke bra" becomes "ikke NOT_bra"
    # - Lemmatization
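
# A minimal sketch of the negation idea suggested above; not part of the
# assignment. The set of negation words is an assumption and would need
# to be extended for real use.
def mark_negation(words):
    '''Returns a copy of the token list where every token after a negation
    word is prefixed with NOT_, until the next punctuation token.'''
    negations = {'ikke', 'aldri', 'ingen'}  # assumed small set of Norwegian negators
    marked = []
    negate = False
    for word in words:
        if word in punctuation:
            negate = False  # punctuation closes the negation scope
            marked.append(word)
        elif word in negations:
            negate = True   # start marking from the next token
            marked.append(word)
        else:
            marked.append('NOT_' + word if negate else word)
    return marked

# Example: mark_negation(['filmen', 'er', 'ikke', 'bra', '.'])
# -> ['filmen', 'er', 'ikke', 'NOT_bra', '.']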
if __name__ == '__main__':
    main()