import codecs
import re

import nltk
from nltk.probability import FreqDist


def read_german_text(filename):
    """Read a Latin-1 encoded text file and return its contents as a str.

    Parameters
    ----------
    filename : str or path-like
        Path to the file to read.

    Returns
    -------
    str
        The decoded file contents.
    """
    # newline='' disables universal-newline translation so that line endings
    # come through untouched, matching the previous codecs.open() behaviour
    # (codecs.open never translated newlines).  The context manager also
    # guarantees the handle is closed even if read() raises.
    with open(filename, 'r', encoding='latin-1', newline='') as textfile:
        return textfile.read()


def get_most_frequent_words(text, n_words=10):
    """Return the ``n_words`` most frequent word tokens in ``text``.

    Tokens are maximal runs of word characters (regex ``\\w+``), which is
    exactly what the previous ``nltk.RegexpTokenizer(r"\\w+")`` produced.

    Parameters
    ----------
    text : str
        The text to tokenize and count.
    n_words : int, optional
        Number of top words to return (default 10).

    Returns
    -------
    list of str
        The most common tokens, most frequent first; ties keep first-seen
        order (Counter/FreqDist semantics).
    """
    word_list = re.findall(r"\w+", text)
    freq_dist = FreqDist(word_list)
    # most_common yields (word, count) pairs; keep only the words.
    return [word for word, _count in freq_dist.most_common(n_words)]


def _sort_coo(coo_matrix):
    """Return unique (column, score) pairs sorted by score, then column, descending.

    ``coo_matrix`` is expected to expose ``.col`` and ``.data`` arrays
    (scipy COO sparse-matrix interface).
    """
    pairs = set(zip(coo_matrix.col, coo_matrix.data))
    return sorted(pairs, key=lambda item: (item[1], item[0]), reverse=True)


def _extract_topn_from_vector(feature_names, sorted_items, n_words):
    """Map the top ``n_words`` feature names to their tf-idf scores (rounded to 3 dp)."""
    return {feature_names[idx]: round(score, 3)
            for idx, score in sorted_items[:n_words]}


def get_most_discriminant_words(text, text_vectorizer, text_counterizer,
                                n_words=10):
    """Return the ``n_words`` highest tf-idf scored words for ``text``.

    Parameters
    ----------
    text :
        The document(s) to score; passed straight to
        ``text_counterizer.transform``.  An empty/falsy value short-circuits
        to an empty result.
    text_vectorizer :
        Fitted tf-idf transformer (e.g. sklearn ``TfidfTransformer``).
    text_counterizer :
        Fitted count vectorizer (e.g. sklearn ``CountVectorizer``) providing
        ``transform`` and ``get_feature_names``.
    n_words : int, optional
        Number of top-scoring words to return (default 10).

    Returns
    -------
    list of str
        Words ordered by descending tf-idf score.
    """
    if not text:
        return []
    tf_idf_vector = text_vectorizer.transform(text_counterizer.transform(text))
    sorted_items = _sort_coo(tf_idf_vector.tocoo())
    # NOTE(review): sklearn removed get_feature_names() in 1.2 in favour of
    # get_feature_names_out() -- confirm against the installed version.
    feature_names = text_counterizer.get_feature_names()
    keywords = _extract_topn_from_vector(feature_names, sorted_items, n_words)
    return list(keywords.keys())


def regexp(pattern, text):
    """Find every match of ``pattern`` in ``text``.

    Parameters
    ----------
    pattern : str
        Regular expression to search for.
    text : str
        Text to search in.

    Returns
    -------
    tuple of (list of int, list of int)
        Parallel lists of match start and end indices.
    """
    # The original also accumulated match.group() values but never returned
    # them; that dead work has been removed.
    matches = list(re.finditer(pattern, text))
    index_start = [m.start() for m in matches]
    index_stop = [m.end() for m in matches]
    return index_start, index_stop