Now split_reports.py and cluster plots should work better with golden labels

c7bab612 · max · 8ed471a1 · c7bab612 · c7bab612 · c7bab612
Commit c7bab612 authored Sep 26, 2022 by max
7 changed files
--- a/TextClustering/clusterset_histos.py
+++ b/TextClustering/clusterset_histos.py
@@ -14,10 +14,9 @@ plot_author_histos = False
 cluster = 2
 clustersets = ["HDBSCAN", "KMeans", "LDA", "GSDPMM",
-               "top2vec", "Patho_BERT", "German_BERT"]
+               "top2vec", "Patho_BERT", "German_BERT", "golden"]
 df = pd.read_pickle(args.df_cases_file)
-authors_labels = df["label_author"]
 # plot histograms: how much docs do have the same label=cluster-index?
 for i,label_set in enumerate(clustersets):
@@ -27,19 +26,24 @@ for i,label_set in enumerate(clustersets):
    except:
        print(f"skipping {label_set}. it is not in the df_cases_file.")
        continue
    if plot_author_histos:
+        if 'label_author' in df:
+            authors_labels = df["label_author"]
-        authors_of_cluster = [authors_labels[i] for i, label in enumerate(cluster_labels) if
+            authors_of_cluster = [authors_labels[i] for i, label in enumerate(cluster_labels) if
-                              label == cluster]
+                                  label == cluster]
-        authors = np.asarray(authors_of_cluster)
+            authors = np.asarray(authors_of_cluster)
-        x = [-1,0,1,2,3]
+            x = [-1,0,1,2,3]
-        h = []
+            h = []
-        for l in x:
+            for l in x:
-            h.append(sum([1 for a in authors if a == l]))
+                h.append(sum([1 for a in authors if a == l]))
-        plt.bar(x, height=h)
+            plt.bar(x, height=h)
-        plt.title(label_set + " authors in cluster " + str(cluster))
+            plt.title(label_set + " authors in cluster " + str(cluster))
-        file_path = 'TextClustering/plots/histograms/histogram_' + label_set + "_cluster" + str(cluster) + "_authors.png"
+            file_path = 'TextClustering/plots/histograms/histogram_' + label_set + "_cluster" + str(cluster) + "_authors.png"
+        else:
+            print(f'Cant plot author histos, there is not "label_author" in df_cases.')
    else:
        labels = np.asarray([l for l in cluster_labels if l != -1])

--- a/TextClustering/plot_clustersets.py
+++ b/TextClustering/plot_clustersets.py
@@ -13,8 +13,7 @@ df_cases_file = "database/df_cases.pkl"
 def save_umap_plot(clustersetname, df, title=None):
    if not 'label_' + clustersetname in df:
-        print("skipping " + clustersetname + ", it is not in df_cases_file:")
+        print("skipping " + clustersetname + ", it is not in df_cases_file.")
-        print(df)
        return
    predictedCluster_text_features = label_list_as_int_list(df['label_' + clustersetname])
@@ -39,7 +38,7 @@ def save_umap_plot(clustersetname, df, title=None):
        golden_labels = df["label_golden"]
        cluster_scatter_plot(umap_text_features2D, golden_labels,
                     "TextClustering/plots/UMAP/" + clustersetname + "_UMAP_goldenlabel.png",
-                             show_plot=False, colorblindfriendly=True
+                             show_plot=False, colorblindfriendly=False
                             , fig_title=title + " colored with golden labels")
 def main():
@@ -63,10 +62,11 @@ def main():
                     "TextClustering/plots/PCA/LDA_PCA.png",
                             show_plot=False, colorblindfriendly=False,
                             fig_title="LDA (PCA representation)")
-        cluster_scatter_plot(features2D, df["label_author"],
+        if 'label_author' in df:
-                     "TextClustering/plots/PCA/LDA_PCA_authors.png",
+            cluster_scatter_plot(features2D, df["label_author"],
-                             show_plot=False, colorblindfriendly=True,
+                                 "TextClustering/plots/PCA/LDA_PCA_authors.png",
-                             number_data_points=False, fig_title='LDA (PCA representation), colored by authors')
+                                 show_plot=False, colorblindfriendly=True,
+                                 number_data_points=False, fig_title='LDA (PCA representation), colored by authors')
 if __name__ == '__main__':
    main()
--- a/TextClustering/utils_metrics.py
+++ b/TextClustering/utils_metrics.py
@@ -169,7 +169,12 @@ def cluster_scatter_plot(umap_text_features2D, labels, file_path = [],
            nummerate_clusters_in_plot(x,y,c)
        '''plt.legend(handles=scatter.legend_elements()[0],
                   labels=[str(l) for l in c], loc="best")'''
-        plt.colorbar(values=[int(e) for e in np.unique(np.asarray(c))])
+        v = [int(e) for e in np.unique(np.asarray(c))]
+        try:
+            plt.colorbar(values=v)
+        except Exception as message:
+            print(f'failed to generate colorbar for {file_path}')
+            print(f'{message}')
        c = [e for e in valid_labels if e > 19]
        if len(c)>0:

--- a/database/bow_prepro_desc_meta.json
+++ b/database/bow_prepro_desc_meta.json
-{"source_data": "../DataNephroTexts/description", "tokenized": true, "cased": false, "stopword_filtered": true, "use_combiner": true, "use_replacer": true, "lemma_mode": 3, "punct_mode": 2, "number_mode": 3}
+{"source_data": "./database/description", "tokenized": true, "cased": false, "stopword_filtered": true, "use_combiner": true, "use_replacer": true, "lemma_mode": 3, "punct_mode": 2, "number_mode": 3}
\ No newline at end of file
--- a/database/bow_prepro_diag_meta.json
+++ b/database/bow_prepro_diag_meta.json
-{"source_data": "../DataNephroTexts/diagnosis", "tokenized": true, "cased": false, "stopword_filtered": true, "use_combiner": true, "use_replacer": true, "lemma_mode": 3, "punct_mode": 2, "number_mode": 3}
+{"source_data": "./database/diagnosis", "tokenized": true, "cased": false, "stopword_filtered": true, "use_combiner": true, "use_replacer": true, "lemma_mode": 3, "punct_mode": 2, "number_mode": 3}
\ No newline at end of file
--- a/database_preparation/reportPreparationTools.py
+++ b/database_preparation/reportPreparationTools.py
@@ -19,7 +19,7 @@ def get3parts(t_file):
    #%% get start codons for description and diagnosis
    # thinking that one pathologist sticks to his/her wording
    start_codon_description = find_codon(t_text,
-                                         ['Lichtmikroskopie:', 'Mikroskopie:', "Histologie:", "Klinische Angaben:", "Wir erhielten"])
+                                         ['Lichtmikroskopie:', 'Mikroskopie:', "Histologie:", "Klinische Angaben:", "Wir erhielten ", "Eingesandt wurde:"])
    start_codon_2nd = find_codon(t_text,
                                 ["Nachbericht", "Immunhistochemie"])

--- a/database_preparation/split_reports.py
+++ b/database_preparation/split_reports.py
@@ -41,6 +41,7 @@ parser.add_argument("--text_encoding",
 parser.add_argument("--use_newest_reports", action='store_true')
 parser.add_argument("--label_in_filename_symbol",
                        default='')
+parser.add_argument("--save_labels_as_ints", action='store_true')
 args = parser.parse_args()
 # %% get all files
@@ -119,7 +120,8 @@ for idx, t_file in tqdm(enumerate(report_file_list)):
                  encoding=args.text_encoding) as text_file:
            text_file.write(label)
-        lst_labels.append('label#' + str(idx) + ".txt")
+        # keep the label text itself (not the label file name) so it can go into df_cases directly
+        lst_labels.append(label)
    lst_description.append('description#' + str(idx) + ".txt")
    lst_diagnose.append('diagnosis#' + str(idx) + ".txt")
@@ -146,18 +148,29 @@ if args.label_in_filename_symbol == '':
    df = pd.DataFrame(list(zip(lst_description, lst_diagnose, lst_end)),
                  columns=['description_text_files', 'diagnosis_text_files', 'end_text_files'])
 else:
-    df = pd.DataFrame(list(zip(lst_description, lst_diagnose, lst_end, lst_labels)),
+    if args.save_labels_as_ints:
-                      columns=['description_text_files', 'diagnosis_text_files', 'end_text_files', 'label_files'])
+        class2int = dict()
+        for i, c in enumerate(list(set(lst_labels))):
+            class2int[c] = i
+        df = pd.DataFrame(list(zip(lst_description, lst_diagnose, lst_end, [class2int[l] for l in lst_labels])),
+                          columns=['description_text_files', 'diagnosis_text_files', 'end_text_files', 'label_golden'])
+        with open(args.target_folder_path + "/classname2integer.txt", "w") as text_file:
+            text_file.write('\n'.join([f'{class2int[name]}\t\t{name}' for name in list(set(lst_labels))]))
+    else:
+        df = pd.DataFrame(list(zip(lst_description, lst_diagnose, lst_end, lst_labels)),
+                          columns=['description_text_files', 'diagnosis_text_files', 'end_text_files', 'label_golden'])
 df.to_pickle(args.df_cases_file)
 # search for authors in end-sections in order to add them as labels to the df_cases file:
-try:
+if args.label_in_filename_symbol == '':
-    add_author_labels_to_df_cases(args.target_folder_path + '/end', args.author_names.split(' '), args.df_cases_file)
+    try:
-except:
+        add_author_labels_to_df_cases(args.target_folder_path + '/end', args.author_names.split(' '), args.df_cases_file)
-    print("label the reports with authors failed.")
+    except:
+        print("label the reports with authors failed.")
 df = pd.read_pickle(args.df_cases_file)
 print(f"saved df_cases at {args.df_cases_file}\n")
 print(f'it looks like this:')
 print(df)
+print(f"first element of {args.df_cases_file}: \n{df.iloc[0]}")
 print()