import codecs
import re

import nltk
from nltk.probability import FreqDist


def read_german_text(filename):
    """Read a Latin-1 encoded text file and return its contents as a str.

    Parameters
    ----------
    filename : str or path-like
        Path to the file to read.

    Returns
    -------
    str
        The decoded file contents.
    """
    # newline='' disables universal-newline translation so that line endings
    # come through untouched, matching the previous codecs.open() behaviour
    # (codecs.open never translated newlines).  The context manager also
    # guarantees the handle is closed even if read() raises.
    with open(filename, 'r', encoding='latin-1', newline='') as textfile:
        return textfile.read()


def get_most_frequent_words(text, n_words=10):
    """Return the ``n_words`` most frequent word tokens in ``text``.

    Tokens are maximal runs of word characters (regex ``\\w+``), which is
    exactly what the previous ``nltk.RegexpTokenizer(r"\\w+")`` produced.

    Parameters
    ----------
    text : str
        The text to tokenize and count.
    n_words : int, optional
        Number of top words to return (default 10).

    Returns
    -------
    list of str
        The most common tokens, most frequent first; ties keep first-seen
        order (Counter/FreqDist semantics).
    """
    word_list = re.findall(r"\w+", text)
    freq_dist = FreqDist(word_list)
    # most_common yields (word, count) pairs; keep only the words.
    return [word for word, _count in freq_dist.most_common(n_words)]


def _sort_coo(coo_matrix):
    """Return unique (column, score) pairs sorted by score, then column, descending.

    ``coo_matrix`` is expected to expose ``.col`` and ``.data`` arrays
    (scipy COO sparse-matrix interface).
    """
    pairs = set(zip(coo_matrix.col, coo_matrix.data))
    return sorted(pairs, key=lambda item: (item[1], item[0]), reverse=True)


def _extract_topn_from_vector(feature_names, sorted_items, n_words):
    """Map the top ``n_words`` feature names to their tf-idf scores (rounded to 3 dp)."""
    return {feature_names[idx]: round(score, 3)
            for idx, score in sorted_items[:n_words]}


def get_most_discriminant_words(text, text_vectorizer, text_counterizer,
                                n_words=10):
    """Return the ``n_words`` highest tf-idf scored words for ``text``.

    Parameters
    ----------
    text :
        The document(s) to score; passed straight to
        ``text_counterizer.transform``.  An empty/falsy value short-circuits
        to an empty result.
    text_vectorizer :
        Fitted tf-idf transformer (e.g. sklearn ``TfidfTransformer``).
    text_counterizer :
        Fitted count vectorizer (e.g. sklearn ``CountVectorizer``) providing
        ``transform`` and ``get_feature_names``.
    n_words : int, optional
        Number of top-scoring words to return (default 10).

    Returns
    -------
    list of str
        Words ordered by descending tf-idf score.
    """
    if not text:
        return []
    tf_idf_vector = text_vectorizer.transform(text_counterizer.transform(text))
    sorted_items = _sort_coo(tf_idf_vector.tocoo())
    # NOTE(review): sklearn removed get_feature_names() in 1.2 in favour of
    # get_feature_names_out() -- confirm against the installed version.
    feature_names = text_counterizer.get_feature_names()
    keywords = _extract_topn_from_vector(feature_names, sorted_items, n_words)
    return list(keywords.keys())


def regexp(pattern, text):
    """Find every match of ``pattern`` in ``text``.

    Parameters
    ----------
    pattern : str
        Regular expression to search for.
    text : str
        Text to search in.

    Returns
    -------
    tuple of (list of int, list of int)
        Parallel lists of match start and end indices.
    """
    # The original also accumulated match.group() values but never returned
    # them; that dead work has been removed.
    matches = list(re.finditer(pattern, text))
    index_start = [m.start() for m in matches]
    index_stop = [m.end() for m in matches]
    return index_start, index_stop