Commit 10d56795 authored by max-laptop

improved pipelines for new topic modeling tasks

parent c7bab612
@@ -18,13 +18,18 @@ Feel free to use and adapt the scripts to your own needs.
## Requirements
Create a new environment, then install the required python packages with:
```pip install -r requirements.txt```
+(so far tested with Python 3.10)
The script ```database_preparation/preprocess.py``` requires some NLTK corpora:
```
import nltk
nltk.download('stopwords')
nltk.download('punkt')
+nltk.download('punkt_tab')
```
\ No newline at end of file
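The added `punkt_tab` download completes the set of corpora the preprocessing script needs. If repeated setup runs should skip corpora that are already installed, a guard like the following could be used (a minimal sketch; the check-before-download logic is an assumption, not part of this commit):
```
import nltk

# fetch the corpora used by database_preparation/preprocess.py,
# skipping anything already present in the local nltk data path
for resource, path in [("stopwords", "corpora/stopwords"),
                       ("punkt", "tokenizers/punkt"),
                       ("punkt_tab", "tokenizers/punkt_tab")]:
    try:
        nltk.data.find(path)
    except LookupError:
        nltk.download(resource)
```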
@@ -15,9 +15,9 @@ from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
import nltk
-import pickle
import datasets
import pyarrow as pa
+import pickle

fold_amount = 10
@@ -102,6 +102,7 @@ def cross_validate_label_corpus_with_simple_SVM(labels, path2corpus = "./databas
    returns 10-fold-cross-validated accuracy value
    """
    texts = pd.read_pickle(path2corpus)
    from database_preparation.utils_labeled_datasets import text_label_2_labeled_dataset
...
@@ -6,7 +6,7 @@ import os
import sys
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
-from sklearn.metrics import plot_confusion_matrix
+#from sklearn.metrics import ConfusionMatrixDisplay  # plot_confusion_matrix  # TODO: replace with ConfusionMatrixDisplay
import seaborn as sn
import pandas as pd
import pickle
...
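The TODO above notes that the removed `plot_confusion_matrix` still needs a `ConfusionMatrixDisplay` replacement. A minimal sketch of what that swap could look like (the toy labels are placeholders, not data from this repository):
```
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_predictions builds the same figure
# directly from true and predicted labels
y_true = [0, 1, 1, 0, 1]   # placeholder labels
y_pred = [0, 1, 0, 0, 1]
ConfusionMatrixDisplay.from_predictions(y_true, y_pred, cmap=plt.cm.Blues)
plt.show()
```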
-#%% argsparse section
+# argsparse section
import sys, os
sys.path.append(os.getcwd())
from TextClustering.argsparse_clustering_preamble import argsparse_preamble
args = argsparse_preamble()
-from database_preparation.utils_labeled_datasets import is_text_lst_tokenized, is_text_lst_tfidf_vectorized
-import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import pandas as pd
@@ -18,21 +16,28 @@ from nltk import RegexpTokenizer
from TextClustering.utils_metrics import ClusterMetrics
from database_preparation.preprocess import print_meta_data
from TextClassification.classification_for_cluster_evaluation import cross_validate_label_corpus_with_simple_SVM
+import json

tokenizer = RegexpTokenizer(r'\w+')

-#%% load the data
-with open(args.path2corpus, 'rb') as f:
-    diag_lst = pickle.load(f)
+# load the data
+'''with open(args.path2corpus, 'rb') as f:
+    doc_list = pickle.load(f)'''
+assert args.path2corpus[-4:] == '.pkl'
+corpus_df = pd.read_pickle(args.path2corpus)
+corpus_dict = {case_id: corpus_df.loc[corpus_df['case_id'] == case_id, 'preprocessed_text'].values[0] for case_id in corpus_df['case_id']}
+doc_list = corpus_df['preprocessed_text'].tolist()

-print_meta_data(args.path2corpus)
+#print_meta_data(args.path2corpus)

def identity(word):
    return word

-text_is_vectorized = is_text_lst_tfidf_vectorized(args.path2corpus)
-if not (is_text_lst_tokenized(args.path2corpus) or text_is_vectorized):
+text_is_vectorized = False  #is_text_lst_tfidf_vectorized(args.path2corpus)
+if not type(doc_list[0]) == list:
    print("Error: " + args.path2corpus + ' has to be a list of tokenized texts or tfidf-vectorized texts!')
    exit(1)
@@ -42,11 +47,13 @@ def create_vectorizer(data):
    return vec

if text_is_vectorized:
-    text_features = diag_lst
+    text_features = doc_list
else:
-    text_features = create_vectorizer(diag_lst)
+    text_features = create_vectorizer(doc_list)

-#%% perform umap for dimension-reduction (for cluster-detection)
+print(f"tf-idf vectorized corpus generated {text_features.shape[0]} vectors of length {text_features.shape[1]}.")
+
+# perform umap for dimension-reduction (for cluster-detection)
umap_text_features = umap.UMAP(n_neighbors=15,
                               n_components=5,
                               metric='cosine').fit_transform(text_features)
@@ -57,7 +64,7 @@ umap_text_features2D = umap.UMAP(n_neighbors=15,
                                 min_dist=0.0, metric='cosine').fit_transform(text_features)

if args.find_k_value:
-    # %% perform hdbscan for cluster-dectection with different cluster sizes to find a good solution...by eye..
+    # perform hdbscan for cluster-detection with different cluster sizes to find a good solution... by eye
    list_cluster_size = [int(k) for k in np.arange(3, 23, 1)]
    s_score, n_cluster, svm_scores = [], [], []
    for i_cluster_size in list_cluster_size:
@@ -67,20 +74,20 @@ if args.find_k_value:
                                  cluster_selection_method='eom').fit(umap_text_features)

-        result = pd.DataFrame(umap_text_features2D, columns=['x', 'y'])
-        result['labels'] = cluster.labels_.tolist()  # cluster.labels_
-        print(np.unique(result.labels))
+        df_clustering_result2d = pd.DataFrame(umap_text_features2D, columns=['x', 'y'])
+        df_clustering_result2d['labels'] = cluster.labels_.tolist()  # cluster.labels_
+        print(np.unique(df_clustering_result2d.labels))

-        #%% Visualize clusters
-        outliers = result.loc[result.labels == -1, :]
-        clustered = result.loc[result.labels != -1, :]
+        # Visualize clusters
+        outliers = df_clustering_result2d.loc[df_clustering_result2d.labels == -1, :]
+        clustered = df_clustering_result2d.loc[df_clustering_result2d.labels != -1, :]
        clustered['labels'] = [str(i) for i in clustered['labels']]
        evaluation = ClusterMetrics(umap_text_features, cluster.labels_.tolist())
        s_score.append(evaluation.s_score)
        svm_scores.append(
-            cross_validate_label_corpus_with_simple_SVM(cluster.labels_.tolist(), args.path2corpus + '.pkl',
+            cross_validate_label_corpus_with_simple_SVM(cluster.labels_.tolist(), args.path2corpus,
                                                        False))
        n_cluster.append(len(np.unique(cluster.labels_.tolist())))
@@ -103,30 +110,36 @@ if args.find_k_value:
    exit()

-#%% perform hdbscan with best cluster size
+# perform hdbscan with best cluster size
cluster = hdbscan.HDBSCAN(min_cluster_size=args.k_value,
                          metric='euclidean',
                          cluster_selection_method='eom').fit(umap_text_features)

-result = pd.DataFrame(umap_text_features2D, columns=['x', 'y'])
-result['labels'] = cluster.labels_.tolist()  # cluster.labels_
-clusters = np.int8([str(i) for i in result['labels']])
-outliers = result.loc[result.labels == -1, :]
-clusters_no_outliers = result.loc[result.labels != -1, :]
-unique_clusters = np.unique(result.labels)
+df_clustering_result2d = pd.DataFrame(umap_text_features2D, columns=['x', 'y'])
+df_clustering_result2d['labels'] = cluster.labels_.tolist()  # cluster.labels_
+clusters = np.int8([str(i) for i in df_clustering_result2d['labels']])
+outliers = df_clustering_result2d.loc[df_clustering_result2d.labels == -1, :]
+clusters_no_outliers = df_clustering_result2d.loc[df_clustering_result2d.labels != -1, :]
+unique_clusters = np.unique(df_clustering_result2d.labels)
+print(f"\nfound {len(unique_clusters[unique_clusters>-1])} clusters.\n")

# save umaped vectors:
-df = pd.read_pickle(args.df_cases_file)
-df['umapX_HDBSCAN'] = umap_text_features2D[:, 0]
-df['umapY_HDBSCAN'] = umap_text_features2D[:, 1]
-df['label_HDBSCAN'] = clusters
-df.to_pickle(args.df_cases_file)
+#df = pd.read_pickle(args.df_cases_file)
+corpus_df['umapX_HDBSCAN'] = umap_text_features2D[:, 0]
+corpus_df['umapY_HDBSCAN'] = umap_text_features2D[:, 1]
+corpus_df['label_HDBSCAN'] = clusters
+corpus_df.to_pickle(args.path2corpus)
+corpus_df.to_csv(args.path2corpus.replace('.pkl', '.csv'))
+#corpus_df.to_excel(args.path2corpus.replace('.pkl', '.xlsx'))

-#%% and evaluate the results with several metrics (not needing ground truth)
-evaluation = ClusterMetrics(umap_text_features[result.labels >= 0,], clusters_no_outliers.labels.tolist(),
-                            file_name= "TextClustering/cluster_metrics/HDBSCAN_metrics.pkl")
+print(f"updated {args.path2corpus} with umapX_HDBSCAN, umapY_HDBSCAN and label_HDBSCAN.")
+print(corpus_df.head())
+print(f"cluster count: {len(unique_clusters)}")
+print(f"outliers: {len(outliers)}")
+
+# and evaluate the results with several metrics (not needing ground truth)
+evaluation = ClusterMetrics(umap_text_features[df_clustering_result2d.labels >= 0,], clusters_no_outliers.labels.tolist(),
+                            file_name=args.path2corpus.replace('.pkl', '').replace('.json', '').replace('.df', '') + '_HDBSCAN_metrics')
evaluation.write_to_file()
...
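With this commit the HDBSCAN results are written back into the corpus DataFrame itself (umapX_HDBSCAN, umapY_HDBSCAN, label_HDBSCAN) instead of a separate df_cases file. A minimal sketch of reading them back downstream, assuming the bow_short_diag corpus path used elsewhere in this commit:
```
import pandas as pd

# the HDBSCAN script stores its results directly in the corpus pickle
corpus_df = pd.read_pickle("database/bow_short_diag/bow_short_diag.df.pkl")

# 2D UMAP coordinates and one cluster label per report (-1 marks outliers)
print(corpus_df[["case_id", "umapX_HDBSCAN", "umapY_HDBSCAN", "label_HDBSCAN"]].head())
print(corpus_df["label_HDBSCAN"].value_counts())
```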
-#%% argsparse section
+# argsparse section
import sys, os
sys.path.append(os.getcwd())
@@ -13,7 +13,7 @@ if not is_text_lst_tokenized(args.path2corpus):
          'Please pass texts list where each text is tokenized (a list of words).')
    exit(1)

-#%% import section
+# import section
import pickle
import gensim
import gensim.corpora as corpora
@@ -26,20 +26,20 @@ from tqdm import tqdm
from database_preparation.preprocess import print_meta_data
from TextClassification.classification_for_cluster_evaluation import cross_validate_label_corpus_with_simple_SVM

-#%% load the diag and main_diag list
+# load the diag and main_diag list
with open(args.path2corpus, 'rb') as f:
    diag_lst = pickle.load(f)
print_meta_data(args.path2corpus)

-#%% prepare database_preparation for LDA-model-trainng
+# prepare database_preparation for LDA-model training
# Creates the dictionary, which is a mapping of word IDs to words.
words = corpora.Dictionary(diag_lst)

# Turns each document into a bag of words.
corpus = [words.doc2bow(doc) for doc in diag_lst]  # is that already a model?

-#%% train LDA-model with different number of clusters
+# train LDA-model with different number of clusters
if args.find_k_value:
    limit=21; start=5; step=1
    coherence_values = []
@@ -72,7 +72,7 @@ if args.find_k_value:
        n_cluster.append(len(np.unique(np.asarray(predictedCluster))))
        print("coherence: " + str(coherencemodel.get_coherence()))

-    #%% visualize the results
+    # visualize the results
    x = range(start, limit, step)
    fig, ax1 = plt.subplots()
    ax2 = ax1.twinx()
@@ -92,7 +92,7 @@ if args.find_k_value:
    plt.show()
    exit()

-#%% train LDA-model
+# train LDA-model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=words,
                                            num_topics=args.k_value,
@@ -103,7 +103,7 @@ lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            per_word_topics=True)

-#%% get topic weights / features
+# get topic weights / features
topic_weights = []
for i, row_list in enumerate(lda_model[corpus]):
    topic_weights.append([w for i, w in row_list[0]])
@@ -111,7 +111,7 @@ for i, row_list in enumerate(lda_model[corpus]):
# Array of topic weights
text_features = pd.DataFrame(topic_weights).fillna(0).values

-#%% get prediction
+# get prediction
predictedCluster = np.argmax(text_features, axis=1)

# and add it to the dataframe
@@ -126,7 +126,7 @@ df['pcaX_LDA'] = reduced_features[:, 0]
df['pcaY_LDA'] = reduced_features[:, 1]

-#%% and with umap
+# and with umap
import umap
umap_text_features2D = umap.UMAP(n_neighbors=15,
                                 n_components=2,
@@ -136,7 +136,7 @@ df['umapX_LDA'] = umap_text_features2D[:, 0]
df['umapY_LDA'] = umap_text_features2D[:, 1]
df.to_pickle(args.df_cases_file)

-#%% evalute the model
+# evaluate the model
from TextClustering.utils_metrics import ClusterMetrics
evaluation = ClusterMetrics(text_features, predictedCluster,
                            file_name= "TextClustering/cluster_metrics/LDA_metrics.pkl")
...
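The inline comment above ("is that already a model?") touches a common point of confusion: `doc2bow` only converts a tokenized document into (token-id, count) pairs; the LDA model is trained on that representation afterwards. A small self-contained illustration with placeholder documents:
```
import gensim.corpora as corpora

# two placeholder tokenized documents standing in for the preprocessed reports
toy_docs = [["niere", "biopsie", "glomerulus"],
            ["niere", "tubulus", "tubulus"]]

words = corpora.Dictionary(toy_docs)                  # word ID <-> word mapping
bow_corpus = [words.doc2bow(doc) for doc in toy_docs]

# each document is now a list of (token_id, count) pairs, e.g. the repeated
# token gets count 2; this sparse representation is what LdaModel is trained on
print(words.token2id)
print(bow_corpus)
```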
@@ -6,26 +6,25 @@ import pickle
import openpyxl

# parameters:
-df_cases_file = "database/df_cases.pkl"
+df_cases_file = "database/bow_short_diag/bow_short_diag.df.pkl"
print_latex = False
filter_stop_words = True
path2umap_pics = 'TextClustering/plots/UMAP/'
save_umap_picture_in_table = True
-path2corpus_bow_preprocessed = 'database/bow_prepro_diag.pkl'
-path2corpus_embedding_preprocessed = 'database/embedding_prepro_diag.pkl'
+stopword_filtered_corpus = True

####### functions ##########

def main():
-    cluster_sets = ['KMeans', 'LDA', 'HDBSCAN', 'German_BERT', 'Patho_BERT', 'top2vec', 'GSDPMM']
+    cluster_sets = ['HDBSCAN']  #['HDBSCAN', 'German_BERT', 'Patho_BERT', 'top2vec', 'GSDPMM']
    # cluster_sets = ['German_BERT']
    df_cases = pd.read_pickle(df_cases_file)

    for cluster_set in cluster_sets:
        # re-generate the topic words:
-        excel_file_path = 'TextClustering/tables/WordsPerCluster_' + cluster_set + '.xlsx'
+        excel_file_path = df_cases_file.replace('.df.pkl', f'_{cluster_set}_topwords.xlsx')  #'TextClustering/tables/WordsPerCluster_' + cluster_set + '.xlsx'

        # convert nan-values to int(-1):
        try:
@@ -34,20 +33,12 @@ def main():
            print(f"skipping {cluster_set}. it is not in the df_cases_file.")
            continue

-        if cluster_set in ['German_BERT', 'Patho_BERT', 'top2vec']:
-            text_corpus_path = path2corpus_embedding_preprocessed
-        else:
-            text_corpus_path = path2corpus_bow_preprocessed
-        meta_params = get_metadata(text_corpus_path)
-        with open(text_corpus_path, 'rb') as f:
-            diag_lst = pickle.load(f)
+        report_list = df_cases['preprocessed_text'].tolist()

        # do not apply stopword filtering if it is already stopword filtered!
-        generate_save_topicwords(clusters, diag_lst, save_excel_file_path=excel_file_path,
+        generate_save_topicwords(clusters, report_list, save_excel_file_path=excel_file_path,
                                  n_words=10, print_latex_table=print_latex,
-                                  filter_stop_words=filter_stop_words and not meta_params['stopword_filtered'])
+                                  filter_stop_words=filter_stop_words and not stopword_filtered_corpus)

        if save_umap_picture_in_table:
            pic_path = path2umap_pics + cluster_set + "_UMAP.png"
            try:
@@ -66,14 +57,14 @@ def main():
    ######### topic words of authors #########
-    clusters = label_list_as_int_list(df_cases['label_author'])
+    ''' clusters = label_list_as_int_list(df_cases['label_author'])
    excel_file_path = 'TextClustering/tables/WordsPerCluster_authors.xlsx'
    with open(path2corpus_bow_preprocessed, 'rb') as f:
-        diag_lst = pickle.load(f)
+        report_list = pickle.load(f)

-    generate_save_topicwords(clusters, diag_lst, save_excel_file_path=excel_file_path,
+    generate_save_topicwords(clusters, report_list, save_excel_file_path=excel_file_path,
                              n_words=20, print_latex_table=print_latex,
-                              filter_stop_words=False)
+                              filter_stop_words=False)'''

if __name__ == '__main__':
...
@@ -5,11 +5,13 @@ from database_preparation.utils_labeled_datasets import label_list_as_int_list
clustersets = ["GSDPMM", "KMeans", "LDA", "HDBSCAN",
               "top2vec", "Patho_BERT", "German_BERT"]
+clustersets = ["HDBSCAN"]
plot_titles = ["GSDPMM (UMAP representation)", "k-means (UMAP representation)",
               "LDA (UMAP representation)", "HDBSCAN (UMAP representation)",
               "top2vec (UMAP representation)", "Patho-BERT (UMAP representation)",
               "German-BERT (UMAP representation)"]
-df_cases_file = "database/df_cases.pkl"
+plot_titles = ["HDBSCAN (UMAP representation)"]
+
+df_cases_file = "database/bow_short_diag/bow_short_diag.df.pkl"

def save_umap_plot(clustersetname, df, title=None):
    if not 'label_' + clustersetname in df:
@@ -26,7 +28,7 @@ def save_umap_plot(clustersetname, df, title=None):
    umap_text_features2D = np.transpose(umap_text_features2D)
    cluster_scatter_plot(umap_text_features2D, predictedCluster_text_features,
-                         "TextClustering/plots/UMAP/" + clustersetname + "_UMAP.png",
+                         df_cases_file.replace('.df.pkl', '_') + clustersetname + "_UMAP.png",
                         show_plot=False, colorblindfriendly=False, fig_title=title)

    if 'label_author' in df:
        author_labels = df["label_author"]
...
+from sklearn.feature_extraction.text import CountVectorizer
+import joblib
+import topicwizard
+from sklearn.decomposition import NMF
+from sklearn.pipeline import make_pipeline
+
+'''
+installation:
+pip install topic-wizard
+'''
+
+if __name__ == '__main__':
+    vectorizer = CountVectorizer(min_df=5, max_df=0.8, stop_words="english")
+    model = NMF(n_components=10)
+    topic_pipeline = make_pipeline(vectorizer, model)
+
+    from sklearn.datasets import fetch_20newsgroups
+    newsgroups = fetch_20newsgroups(subset="all")
+    corpus = newsgroups.data
+
+    # Sklearn gives the labels back as integers, we have to map them back to
+    # the actual textual label.
+    group_labels = [newsgroups.target_names[label] for label in newsgroups.target]
+
+    topic_pipeline.fit(corpus)
+    print("launching topicwizard visualizer")
+    topicwizard.visualize(corpus, model=topic_pipeline)
+    exit(0)
\ No newline at end of file
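The new demo script above runs topicwizard on the 20 newsgroups sample. A sketch of pointing the same kind of pipeline at the project's own preprocessed corpus; the .df.pkl path and the preprocessed_text column come from other files in this commit, while joining the token lists back into strings is an assumption:
```
import pandas as pd
import topicwizard
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline

# load the preprocessed reports produced by database_preparation/preprocess.py
corpus_df = pd.read_pickle("database/bow_short_diag/bow_short_diag.df.pkl")
corpus = [" ".join(tokens) for tokens in corpus_df["preprocessed_text"]]

topic_pipeline = make_pipeline(CountVectorizer(min_df=5, max_df=0.8),
                               NMF(n_components=10))
topic_pipeline.fit(corpus)
topicwizard.visualize(corpus, model=topic_pipeline)
```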
import openpyxl
from TextClustering.utils_wordlist import get_top_cluster_words_as_latex_table
-from googletrans import Translator # use pip install googletrans==3.1.0a0, 3.0 version is broken
-from utils_general import custom_translation

path2table = "WordsPerCluster_HDBSCAN.xlsx"
@@ -12,6 +12,13 @@ black = '1'
latex_weak_word = '\\weakcolor'
latex_strong_word = '\\strongcolor'

+translate_to_eng = False
+if translate_to_eng:
+    from googletrans import Translator # use pip install googletrans==3.1.0a0, 3.0 version is broken
+    from utils_general import custom_translation

def color2latex_color(color):
    if color == green:
@@ -49,9 +56,9 @@ def main():
    extraction_methods = ['tf-idf', 'SVM']
    cluster_method = 'HDBSCAN'
    anotate_svm_as_tfidf = True
-    print_also_translated_tables = True

-    translator = Translator()
+    if translate_to_eng:
+        translator = Translator()

    words_list_tfidf = []
    colorstfidf = []
@@ -82,7 +89,7 @@ def main():
            print(latex)

        # print english topic words:
-        if print_also_translated_tables:
+        if translate_to_eng:
            description = f'Annotated topic words (translated from German to English), ' \
                          f'extracted from the {cluster_method} cluster-set, ' \
                          f'using the {extraction_method} based extraction method.'
...
@@ -63,7 +63,7 @@ class ClusterMetrics:
        df = pd.DataFrame(results,
                          index =['s-score', 'ch-index', 'db-score', 'cop', 'dunn-score', 'entropy', 'svm-accuracy'],
                          columns =[tail])
-        df.to_pickle(self.file_name)
+        df.to_csv(self.file_name.replace('.pkl', '') + ".csv")
        print(df)
...
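Since ClusterMetrics now writes a CSV instead of a pickle, the metric table can be inspected without unpickling. A minimal sketch; the file name follows the *_HDBSCAN_metrics pattern produced by the HDBSCAN script earlier in this commit, so treat the exact path as an assumption:
```
import pandas as pd

# one row per metric (s-score, ch-index, db-score, cop, dunn-score,
# entropy, svm-accuracy), one column per evaluated cluster set
metrics = pd.read_csv("database/bow_short_diag/bow_short_diag_HDBSCAN_metrics.csv", index_col=0)
print(metrics.loc[["s-score", "svm-accuracy"]])
```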
@@ -278,6 +278,16 @@ def get_nwordlist(text_lst, cluster_lst, n_words=10,

def save_topwordlist_as_excel(file_path, token_list, sheet_name):
    if bool(file_path):
+        token_list_dict = {}
+        for i_cluster, token_list in enumerate(token_list):
+            for i_top, token in enumerate(token_list):
+                key = f"top {i_top + 1}"
+                if key not in token_list_dict:
+                    token_list_dict[key] = []
+                token_list_dict[key].append(token)
+        pandas.DataFrame(token_list_dict).to_csv(file_path.replace('.xlsx', '.csv'), index_label='cluster')
+        return
        try: # if excelfile does exist, append new sheet to workbook:
            excel_book = pxl.load_workbook(file_path)
            if sheet_name in excel_book.get_sheet_names():
@@ -296,6 +306,8 @@ def save_topwordlist_as_excel(file_path, token_list, sheet_name):
        pandas.DataFrame(token_list).to_excel(file_path, sheet_name=sheet_name)

def generate_save_topicwords(predictedClusters, text_lst, save_excel_file_path,
                             n_words=10, print_latex_table=False,
                             extraction_methods=['TFIDF-based', 'frequency-based', 'yake', 'svm-based'],
...
{"source_data": "./database/diagnosis", "tokenized": true, "cased": false, "stopword_filtered": true, "use_combiner": true, "use_replacer": true, "lemma_mode": 3, "punct_mode": 2, "number_mode": 3}
\ No newline at end of file
{
"source_data": {"path_to_dataset": "database/nephro_reports_sectiondivided.newest", "sections": ["conclusion"]},
"tokenized": true,
"cased": false,
"stopword_filtered": true,
"use_combiner": true,
"use_replacer": true,
"lemma_mode": 3,
"punct_mode": 2,
"number_mode": 3
}
\ No newline at end of file
@@ -12,7 +12,12 @@ from database_preparation.utils_wordbase import RegexpReplacer, RegexpSynonyms
from database_preparation.stop_word_list import filter_stopwords
import json
import argparse
+import pandas as pd
+
+'''
+# installed: nltk, Hanta, tqdm, numpy
+'''

########## define enums ##########
@@ -118,7 +123,8 @@ def preprocess(parameter_dict):
    Histo numbers and dates will always be removed!
    """
-    source_data_path = parameter_dict['source_data']
+    source_data_path = parameter_dict['source_data']['path_to_dataset']
+    sections_to_preprocess = parameter_dict['source_data']['sections']
    do_tokenize = parameter_dict['tokenized']
    cased = parameter_dict['cased']
    stopword_filtered = parameter_dict['stopword_filtered']
@@ -128,7 +134,6 @@ def preprocess(parameter_dict):
    punct_mode = parameter_dict['punct_mode']
    number_mode = parameter_dict['number_mode']

    lemma_mode = LemmatizeMode(lemma_mode)
    punct_mode = PunctuationMode(punct_mode)
    number_mode = NumberMode(number_mode)
@@ -137,17 +142,34 @@ def preprocess(parameter_dict):
    replacer = RegexpReplacer()
    tagger = ht.HanoverTagger('morphmodel_ger.pgz')

-    file_list = glob.glob(source_data_path + '/*.txt')
-    file_list = sorted(file_list, key=lambda f: int(f[f.find("#") + 1:-4]))
+    # load the files in a sorted way:
+    file_list = glob.glob(source_data_path + '/*.json')
+    file_list.sort()

-    preprocessed_corpus = []
+    preprocessed_corpus = {}
+    corpus = {}
    random_example_idx = random.randrange(min(10, len(file_list)))

    for idx, t_file in tqdm(enumerate(file_list)):
-        # %% load the txt-file
-        t_text = read_german_text(t_file)
+        # load the txt-file
+        # t_text = read_german_text(t_file)
+        case_id = t_file.split('/')[-1].replace('.json', '')
+
+        # load the json-file
+        with open(t_file) as json_file:
+            report = json.load(json_file)
+        t_text = ""
+        for section in sections_to_preprocess:
+            if section in report:
+                if report[section]:
+                    t_text += '\n' + report[section]
+        if not t_text:
+            # print(f"Skipping file {t_file} because it does not contain any of the sections: {sections_to_preprocess}")
+            continue

        original_text = t_text
@@ -169,7 +191,7 @@ def preprocess(parameter_dict):
        # lemmatize / stemming
        t_text = tagger.tag_sent(t_text)

-        # %% lemmarize the text
+        # lemmatize the text
        if lemma_mode == LemmatizeMode.lemma_only_nouns:
            t_text = [lemma for (word, lemma, pos) in t_text if pos == "NN" or pos == "NE"]
        elif lemma_mode == LemmatizeMode.lemma_only_nouns_adja:
@@ -186,13 +208,13 @@ def preprocess(parameter_dict):
        else:  # none
            t_text = [word for (word, lemma, pos) in t_text]

-        # %% filter punctuation:
+        # filter punctuation:
        if punct_mode == PunctuationMode.remove:
            t_text = [token for token in t_text if token not in punctuations_to_remove]
        elif punct_mode == PunctuationMode.replace:
            t_text = [token if token not in punctuations_to_remove else punct_replace_symbol for token in t_text]

-        # %% number filtering
+        # number filtering
        filtered_text = []
        use_single_symbol = True
        for i, word in enumerate(t_text):
@@ -280,8 +302,9 @@ def preprocess(parameter_dict):
        if not do_tokenize:
            t_text = ' '.join(t_text)

-        # %% add to the list
-        preprocessed_corpus.append(t_text)
+        # add to the list
+        preprocessed_corpus[case_id] = t_text
+        corpus[case_id] = original_text

        if idx == random_example_idx:
            print("-------------- Preprocessing Example: ---------------")
@@ -291,7 +314,7 @@ def preprocess(parameter_dict):
            print(t_text)
            print("-----------------------------\n")

-    return preprocessed_corpus
+    return preprocessed_corpus, corpus

def main():
@@ -300,9 +323,9 @@ def main():
    sys.path.append(os.getcwd())
    parser = argparse.ArgumentParser()
    parser.add_argument("--path_to_preprocessing_params",
-                        default='database/bow_prepro_diag_meta.json')
+                        default='database/preprocessed_reports/bow_prepro_diag_config.json')
    parser.add_argument("--target_path",
-                        default='database/bow_prepro_diag.pkl')
+                        default=None)
    args = parser.parse_args()

    with open(args.path_to_preprocessing_params) as json_file:
@@ -312,12 +335,30 @@ def main():
    print(prepro_params_2_string(params))
    print()

-    preprocessed_corpus = preprocess(params)
-
-    with open(args.target_path, 'wb') as f:
-        pickle.dump(preprocessed_corpus, f)
-
-    print(f"saved preprocessed corpus at {args.target_path}")
+    preprocessed_corpus_dict, corpus_dict = preprocess(params)
+
+    #with open(args.target_path, 'wb') as f:
+    #    pickle.dump(preprocessed_corpus_dict, f)
+    if args.target_path is None:
+        args.target_path = args.path_to_preprocessing_params.replace('_config', '').replace('config', '')
+    print(f"saved preprocessed corpus at {args.target_path}, containing {len(preprocessed_corpus_dict)} reports.")
+    print(f"Vocabulary size: {len(set([word for c_id in preprocessed_corpus_dict.keys() for word in preprocessed_corpus_dict[c_id]]))}")
+
+    corpus_as_table = {
+        'case_id': [c_id for c_id in preprocessed_corpus_dict.keys()],
+        'text': [corpus_dict[c_id] for c_id in corpus_dict.keys()],
+        'preprocessed_text': [preprocessed_corpus_dict[c_id] for c_id in preprocessed_corpus_dict.keys()]
+    }
+
+    # store results:
+    df = pd.DataFrame(corpus_as_table)
+    df.to_pickle(args.target_path.replace('.json', '.df.pkl'))
+    df.to_csv(args.target_path.replace('.json', '.df.csv'))
+    #with open(args.target_path, "w") as json_file:
+    #    json.dump(preprocessed_corpus_dict, json_file, indent=4)

    '''print(get_corpus_stats("../DataNephroTexts/description"))
    print(get_corpus_stats("../DataNephroTexts/diagnosis"))
...
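After this change preprocess.py emits a DataFrame with case_id, text, and preprocessed_text columns rather than a pickled list. A minimal sketch of inspecting the output; the path is derived from the new default config name via the replace('_config', '') and replace('.json', '.df.pkl') calls above, so treat it as an assumption:
```
import pandas as pd

# default config database/preprocessed_reports/bow_prepro_diag_config.json
# should yield database/preprocessed_reports/bow_prepro_diag.df.pkl
df = pd.read_pickle("database/preprocessed_reports/bow_prepro_diag.df.pkl")
print(df.columns.tolist())                    # ['case_id', 'text', 'preprocessed_text']
print(df.iloc[0]["case_id"])
print(df.iloc[0]["preprocessed_text"][:20])   # first tokens of one tokenized report
```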
@@ -5,8 +5,6 @@ import os
import sys
sys.path.append(os.getcwd())
-import datasets
-import pyarrow as pa
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import KFold
@@ -44,6 +42,9 @@ def text_label_2_labeled_dataset(texts, unfiltered_labels, print_infos=False):
    - converts the passed text-label pair to datasets.Dataset type.
    - returns dataset in format: {"text": labeled_texts, "label": labels}
    '''
+    import datasets
+    import pyarrow as pa
+
    # collect all text-label pairs, skipping invalid labels
    labeled_texts = []
    labels = []
@@ -148,6 +149,9 @@ def generate_save_hf_dataset(label_set="LDA", overwrite=True, lower=False):
    and saves it under "./database/labeled_dataframes/labeld_dataset_" + label_set
    '''
+    import datasets
+    import pyarrow as pa
+
    dataset_path = "./database/labeled_dataframes/labeld_dataset_" + label_set
    if os.path.exists(dataset_path):
...
-numpy==1.21.0
-gensim==4.2.0
-pandas==1.4.2
-matplotlib==3.5.1
-tqdm==4.64.0
-scikit-learn==1.1.1
-hdbscan==0.8.28
-nltk==3.7
-seaborn==0.11.2
-validclust==0.1.1
-tensorflow-gpu==2.6.0
-wordcloud==1.8.2.2
-joblib==1.1.0
-scipy==1.7.3
-yake==0.4.8
-openpyxl==3.0.10
-googletrans==3.1.0a0
-datasets==2.3.2
-transformers==4.21.0.dev0
-dataclasses==0.8
-pyarrow==8.0.0
-keras==2.6.0
-torch==1.11.0
-hanta==0.2.0
\ No newline at end of file
+nltk # for preprocessing
+Hanta
+tqdm
+numpy
+scikit-learn # for clustering
+matplotlib
+pandas
+umap-learn
+hdbscan
+validclust
+pyarrow # for saving and evaluating data
+datasets
+pyldavis # for ui-supported topic modeling analysis
+openpyxl
+yake # for topicword search
\ No newline at end of file