#%% imports
import string

import numpy as np
from nltk.tokenize import word_tokenize
from scipy.stats import entropy
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm


#%% tokenize text
def tokenize_corpus(corpus):
    '''Tokenize each document and keep only the lemmatized nouns (NN/NE).'''
    # imported here so corpus_entropy() can be used without HanTa installed
    from HanTa import HanoverTagger as ht

    # load the German morphology model for POS tagging and lemmatization
    tagger = ht.HanoverTagger('morphmodel_ger.pgz')

    # work on a copy so the caller's list is not modified in place
    corpus_tokenized = list(corpus)
    for idx, t_text in tqdm(enumerate(corpus_tokenized), total=len(corpus_tokenized)):
        # get the words from the text
        t_text = str(t_text)
        tokens = word_tokenize(t_text, language='german')
        tokens = [token for token in tokens if token not in string.punctuation]

        # keep only the nouns (common nouns NN, proper nouns NE), lemmatized
        tagged = tagger.tag_sent(tokens)
        nouns = [lemma for (word, lemma, pos) in tagged if pos in ("NN", "NE")]

        # mount it back
        corpus_tokenized[idx] = nouns

    # output layer
    return corpus_tokenized


#%% calculate the entropy
def corpus_entropy(corpus):
    '''How much do the single documents differ from the corpus as a whole?

    Returns the mean and standard deviation of the relative entropy
    (Kullback-Leibler divergence) between each document's term frequencies
    and the term frequencies of the whole corpus.
    '''
    # input layer: accept either raw strings or already tokenized documents.
    # Optionally lemmatize the nouns first via tokenize_corpus(); see the
    # usage sketch at the end of the file.
    is_tokenized = not isinstance(corpus[0], str)
    if is_tokenized:
        corpus_not_tokenized = [" ".join(i_text) for i_text in corpus]
    else:
        corpus_not_tokenized = corpus

    # count the word occurrences per document
    vectorizer = CountVectorizer()
    try:
        X = vectorizer.fit_transform(corpus_not_tokenized)
    except ValueError:
        # e.g. empty vocabulary (documents contain only stop words)
        return np.nan, np.nan
    df = X.toarray()

    # calculate the entropy of each document against the whole corpus
    corpus_tf = df.sum(axis=0)
    ent_values = []
    for i in range(df.shape[0]):
        document_tf = df[i, :]
        ent_values.append(entropy(document_tf, qk=corpus_tf))

    # output layer
    entropy_mean = np.nanmean(ent_values)
    entropy_std = np.nanstd(ent_values)
    return entropy_mean, entropy_std


#%% demo
if __name__ == '__main__':
    # Minimal smoke test on illustrative placeholder sentences.
    demo_corpus = [
        "Der Hund spielt im Garten mit dem Ball.",
        "Die Katze schläft auf dem Sofa im Wohnzimmer.",
        "Ein Vogel sitzt auf dem Baum im Garten.",
    ]
    mean_ent, std_ent = corpus_entropy(demo_corpus)
    print(f"corpus entropy: mean={mean_ent:.4f}, std={std_ent:.4f}")
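

#%% usage sketch: full noun pipeline (illustrative)
# The optional tokenize_corpus() step mentioned in corpus_entropy() suggests the
# intended flow: lemmatize the German nouns first, then compare the documents.
# This cell is only a sketch; the function name, variable names, and sentences
# are placeholders, and it assumes the NLTK 'punkt' tokenizer data and the HanTa
# German model ('morphmodel_ger.pgz') are available.
def demo_noun_pipeline():
    raw_corpus = [
        "Der Autor schreibt einen Artikel über die Stadt Hannover.",
        "Die Universität Hannover veröffentlicht einen Bericht über Sprache.",
    ]
    noun_corpus = tokenize_corpus(raw_corpus)   # lists of noun lemmas per document
    return corpus_entropy(noun_corpus)          # (mean, std) of the KL divergences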