from string import punctuation
from collections import Counter
from nltk import bigrams, trigrams, NaiveBayesClassifier, classify
from nltk.corpus import PlaintextCorpusReader, stopwords
def load_corpus(corpus_root):
    '''Load a plain-text corpus from a root directory.

    Args:
        corpus_root: Root folder of the corpus.

    Returns:
        An NLTK ``PlaintextCorpusReader`` built over ``corpus_root`` with a
        file-id pattern that selects ``.txt`` files.
    '''
    # Raw string: a backslash-dot in a normal literal is an invalid escape
    # sequence and triggers a warning on recent Python versions. The pattern
    # value itself is unchanged.
    return PlaintextCorpusReader(corpus_root, r'.*\.txt')
def clean_words(words):
    '''Lowercase tokens and drop punctuation and Norwegian stop words.

    Args:
        words: Iterable of token strings.

    Returns:
        A list with the lowercased tokens that are neither made up solely of
        punctuation characters nor Norwegian stop words.
    '''
    # Sets give constant-time membership tests. The stop words are lowercased
    # so the comparison below is case-insensitive regardless of how the
    # corpus list is cased.
    stopwords_nor = {stopword.lower() for stopword in stopwords.words('norwegian')}
    punctuation_chars = set(punctuation)

    cleaned = []
    for word in words:
        lowered = word.lower()
        # Checking every character also rejects multi-character punctuation
        # tokens such as '...' or '--'. Empty strings are rejected as well,
        # just as the previous substring test did.
        if all(char in punctuation_chars for char in lowered):
            continue
        # Compare the lowercased form so capitalised stop words (for example
        # at the start of a sentence) are removed instead of slipping through.
        if lowered in stopwords_nor:
            continue
        cleaned.append(lowered)
    return cleaned
def split_data(pos_feats, neg_feats, *, test_size=122, dev_size=60):
    '''Split positive and negative feature lists into train/dev/test sets.

    Each input list is cut at the same positions: the first ``test_size``
    items go to the test set, the next ``dev_size`` items to the dev set and
    everything after that to the training set. In every resulting set the
    positive items come before the negative ones.

    Args:
        pos_feats: List of positive examples.
        neg_feats: List of negative examples.
        test_size: Number of items taken from the front of each list for the
            test set. Defaults to 122.
        dev_size: Number of items taken after the test slice of each list
            for the dev set. Defaults to 60.

    Returns:
        A ``(train_set, dev_set, test_set)`` tuple of lists.

    Raises:
        ValueError: If ``test_size`` or ``dev_size`` is negative.
    '''
    # Negative sizes would silently turn into "count from the end" slices.
    if test_size < 0 or dev_size < 0:
        raise ValueError('test_size and dev_size must be non-negative')

    dev_end = test_size + dev_size
    test_set = pos_feats[:test_size] + neg_feats[:test_size]
    dev_set = pos_feats[test_size:dev_end] + neg_feats[test_size:dev_end]
    train_set = pos_feats[dev_end:] + neg_feats[dev_end:]
    return train_set, dev_set, test_set
# OPPGAVE 4.2
def feature_extractor_top_1000(document):
    '''Placeholder feature extractor for task 4.2.

    Not implemented yet: ``document`` is ignored and an empty feature dict
    is returned.

    Args:
        document: The document to extract features from (currently unused).

    Returns:
        A new, empty dict.
    '''
    features = {}  # to be populated by the task solution
    # your code here...
    return features
# OPPGAVE 4.3.1
def feature_extractor_bow(document):
    '''Placeholder feature extractor for task 4.3.1.

    Not implemented yet: ``document`` is ignored and an empty feature dict
    is returned.

    Args:
        document: The document to extract features from (currently unused).

    Returns:
        A new, empty dict.
    '''
    features = {}  # to be populated by the task solution
    # your code here...
    return features
# OPPGAVE 4.3.2
def feature_extractor_bow_bigrams(document):
    '''Placeholder feature extractor for task 4.3.2.

    Not implemented yet: ``document`` is ignored and an empty feature dict
    is returned.

    Args:
        document: The document to extract features from (currently unused).

    Returns:
        A new, empty dict.
    '''
    features = {}  # to be populated by the task solution
    # your code here...
    return features
# OPPGAVE 4.3.3
def feature_extractor_bow_bigrams_trigrams(document):
    '''Placeholder feature extractor for task 4.3.3.

    Not implemented yet: ``document`` is ignored and an empty feature dict
    is returned.

    Args:
        document: The document to extract features from (currently unused).

    Returns:
        A new, empty dict.
    '''
    features = {}  # to be populated by the task solution
    # your code here...
    return features
def main():
    '''Run the exercise driver.

    The task-specific code is still to be filled in; at the moment the
    function only prints the section header for each task and the final
    summary line.
    '''
    # TASK 4.1
    # your code here...

    # TASK 4.2
    print('1000 MEST FREKVENTE ORD =========================================')
    # your code here...
    # answer to the theory questions here...

    # TASK 4.3.1
    print('\nBAG OF WORDS ==================================================')
    # your code here...

    # TASK 4.3.2
    print('\nBAG OF WORDS + BIGRAM =========================================')
    # your code here...

    # TASK 4.3.3
    print('\nBAG OF WORDS + BIGRAM + TRIGRAM ===============================')
    # your code here...

    # TASK 4.4
    # The Norwegian letters in this message had been replaced by '?' through
    # an encoding mishap; they are restored here.
    print('\nModellen med ... gir høyest nøyaktighet på dev_set.')
    # your code here...
    # suggestions for improvement here...
# Run the driver only when this file is executed as a script, not on import.
if "__main__" == __name__:
    main()