#%% tokenize text
def tokenize_corpus(corpus):
    '''Reduce every document of the corpus to a list of German noun lemmas.'''

    #%% imports
    import string
    import nltk
    from nltk.tokenize import word_tokenize
    from HanTa import HanoverTagger as ht
    from tqdm import tqdm

    # NLTK data must be present once: nltk.download('punkt') for the
    # tokenizer, nltk.download('stopwords') for the German stop-word list
    stop_words = set(nltk.corpus.stopwords.words('german'))
    tagger = ht.HanoverTagger('morphmodel_ger.pgz')

    #%% work on a copy so the caller's corpus is not mutated in place
    corpus_tokenized = list(corpus)
    for idx, t_text in enumerate(tqdm(corpus_tokenized)):

        #%% split the text into word tokens, drop punctuation and stop words
        t_text = str(t_text)
        tokens = word_tokenize(t_text, language='german')
        tokens = [token for token in tokens
                  if token not in string.punctuation
                  and token.lower() not in stop_words]

        #%% keep only the nouns
        tagged = tagger.tag_sent(tokens)
        nouns = [lemma for (word, lemma, pos) in tagged
                 if pos in ('NN', 'NE')]  # NN: common noun, NE: proper noun (STTS)

        #%% write the noun lemmas back into the corpus copy
        corpus_tokenized[idx] = nouns

    #%% output layer

    return corpus_tokenized
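
#%% usage sketch (illustrative only; the example sentences are made up and
# the exact lemmas depend on the HanTa model version):
#   docs = ['Der Hund spielt im Garten.', 'Die Katze schläft auf dem Sofa.']
#   tokenize_corpus(docs)  # -> roughly [['Hund', 'Garten'], ['Katze', 'Sofa']]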

#%% calculate the entropy
def corpus_entropy(corpus):
    '''
        How much does each document differ from the corpus as a whole?
        Returns the mean and standard deviation of the per-document
        relative entropy (KL divergence) against the corpus word frequencies.
    '''
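    # Background: scipy.stats.entropy(pk, qk=...) normalizes both arguments
    # to probability distributions and returns the Kullback-Leibler divergence
    #   D(p || q) = sum_i p_i * log(p_i / q_i)   (natural log by default),
    # i.e. how far a document's word distribution is from the corpus-wide one.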
    #%% input layer: accept raw strings as well as pre-tokenized documents
    import numpy as np
    is_tokenized = len(corpus) > 0 and isinstance(corpus[0], (list, tuple))
    if is_tokenized:
        # join the token lists back into one string per document,
        # because CountVectorizer expects raw strings
        corpus_not_tokenized = [" ".join(i_text) for i_text in corpus]
    else:
        corpus_not_tokenized = corpus

    #%% count the word occurrences (documents x vocabulary matrix)
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer()
    try:
        X = vectorizer.fit_transform(corpus_not_tokenized)
    except ValueError:
        # e.g. empty corpus or no valid vocabulary
        return np.nan, np.nan

    counts = X.toarray()

    #%% calculate the per-document relative entropy against the corpus
    from scipy.stats import entropy
    corpus_tf = counts.sum(axis=0)  # corpus-wide term frequencies
    ent_values = []
    for i in range(counts.shape[0]):
        document_tf = counts[i, :]
        ent_values.append(entropy(document_tf, qk=corpus_tf))
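
    # The loop above, written out in plain NumPy for reference (a sketch,
    # mathematically equivalent under the convention 0 * log 0 = 0; every
    # vocabulary entry occurs somewhere in the corpus, so q > 0 throughout):
    #   p = counts / counts.sum(axis=1, keepdims=True)
    #   q = corpus_tf / corpus_tf.sum()
    #   kl = np.where(p > 0, p * np.log(p / q), 0.0).sum(axis=1)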

    #%% output layer
    entropy_mean = np.nanmean(ent_values)
    entropy_std = np.nanstd(ent_values)

    return entropy_mean, entropy_std
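
#%% interpretation note: the KL divergence is 0 when a document's word
# distribution matches the corpus-wide distribution exactly and grows as
# they diverge, so a higher mean indicates a more heterogeneous corpus.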

if __name__ == '__main__':
    # smoke test on a made-up two-document corpus (illustrative data only)
    demo_corpus = ['Der Hund spielt im Garten.',
                   'Die Katze schläft auf dem Sofa, der Hund schläft im Garten.']
    print(corpus_entropy(demo_corpus))