Commit d3fb7ac6 authored by max-laptop

added a pipeline for clustering on our new data structures (see custom_topic_modeling_pipeline.py)

parent e47051e4
@@ -4,9 +4,13 @@ from database_preparation.utils_labeled_datasets import label_list_as_int_list
 from database_preparation.preprocess import get_metadata
 import pickle
 import openpyxl
+import os, sys
+
+argv = sys.argv
 
 # parameters:
 df_cases_file = "data/bow_short_diag/bow_short_diag.df.pkl"
+df_cases_file = argv[1]
 print_latex = False
 filter_stop_words = True
 path2umap_pics = 'TextClustering/plots/UMAP/'
...
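Note: with df_cases_file = argv[1] the script now requires the dataframe path as its first argument and raises an IndexError when called without one. A minimal guard, as a sketch (not part of this commit), would keep the old hardcoded path as a fallback:

    import sys
    # fall back to the previous default when no path is passed (hypothetical, not in this commit)
    df_cases_file = sys.argv[1] if len(sys.argv) > 1 else "data/bow_short_diag/bow_short_diag.df.pkl"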
@@ -2,6 +2,7 @@ import pandas as pd
 from TextClustering.utils_metrics import cluster_scatter_plot
 import numpy as np
 from database_preparation.utils_labeled_datasets import label_list_as_int_list
+import sys, os
 
 clustersets = ["GSDPMM", "KMeans", "LDA", "HDBSCAN",
                "top2vec", "Patho_BERT", "German_BERT"]
@@ -11,8 +12,13 @@ plot_titles = ["GSDPMM (UMAP representation)", "k-means (UMAP representation)",
                "top2vec (UMAP representation)", "Patho-BERT (UMAP representation)",
                "German-BERT (UMAP representation)"]
 plot_titles = ["HDBSCAN (UMAP representation)"]
+argv = sys.argv
+
 df_cases_file = "data/bow_short_diag/bow_short_diag.df.pkl"
+df_cases_file = argv[1]
 
 def save_umap_plot(clustersetname, df, title=None):
     if not 'label_' + clustersetname in df:
         print("skipping " + clustersetname + ", it is not in df_cases_file.")
...
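With this change, the plot script takes the dataframe pickle as its first positional argument, matching how the pipeline script below invokes it:

    python TextClustering/plot_clustersets.py data/bow_short_diag/bow_short_diag.df.pkl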
 {
-"source_data": {"path_to_dataset": "database/nephro_reports_sectiondivided.newest", "sections": ["conclusion"]},
+"source_data": {"path_to_dataset": "data/nephro_reports_sectiondivided.newest", "sections": ["conclusion"]},
 "tokenized": true,
 "cased": false,
 "stopword_filtered": true,
...
+{
+"source_data": {"path_to_dataset": "data/nephro_reports_sectiondivided.newest", "sections": ["short_diagnose", "short_symptoms", "CD73_result"]},
+"tokenized": true,
+"cased": true,
+"stopword_filtered": false,
+"use_combiner": false,
+"use_replacer": false,
+"lemma_mode": 4,
+"punct_mode": 2,
+"number_mode": 1
+}
\ No newline at end of file
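This new config follows the naming convention that the pipeline script below relies on: dropping '_config' and swapping '.json' for '.df.pkl' yields the path of the preprocessed dataframe it produces:

    confi_file = "data/bow_short_diag/bow_short_diag_config.json"
    df_file = confi_file.replace('.json', '.df.pkl').replace('_config', '')
    # -> "data/bow_short_diag/bow_short_diag.df.pkl"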
 {
-"source_data": {"path_to_dataset": "database/nephro_reports_sectiondivided.newest", "sections": ["short_diagnose"]},
+"source_data": {"path_to_dataset": "data/nephro_reports_sectiondivided.newest", "sections": ["short_diagnose"]},
 "tokenized": true,
 "cased": true,
 "stopword_filtered": false,
...
...@@ -323,7 +323,7 @@ def main(): ...@@ -323,7 +323,7 @@ def main():
sys.path.append(os.getcwd()) sys.path.append(os.getcwd())
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--path_to_preprocessing_params", parser.add_argument("--path_to_preprocessing_params",
default='data/preprocessed_reports/bow_prepro_diag_config.json') default='data/bow_diag_clustering/bow_diag_config.json')
parser.add_argument("--target_path", parser.add_argument("--target_path",
default=None) default=None)
args = parser.parse_args() args = parser.parse_args()
......
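The new default makes preprocess.py pick up the relocated diagnosis config; running it explicitly is equivalent:

    python database_preparation/preprocess.py --path_to_preprocessing_params data/bow_diag_clustering/bow_diag_config.json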
+import os, sys
+
+# params:
+# confi_file = "data/bow_short_diag/bow_short_diag_config.json"
+# confi_file = "data/bow_diag_clustering/bow_diag_config.json"
+confi_file = "data/bow_merged_comments/bow_merged_comments_config.json"
+df_file = confi_file.replace('.json', '.df.pkl').replace('_config', '')
+
+# check that we are in the correct working directory:
+workdir = os.getcwd()
+if not workdir.endswith('nlp-in-diagnostic-texts-from-nephropathology'):
+    print(workdir + " is the wrong working directory.")
+    print("please make sure to run this script with working directory '.../path/to/nlp-in-diagnostic-texts-from-nephropathology'.")
+    exit(1)
+
+# Construct the clustering pipeline. This is a suggestion for how to use all the scripts.
+# It is also recommended to run each clustering script one by one to fine-tune the clusterings (with the --find_k_value argument).
+script_queue = [
+    f"python database_preparation/preprocess.py --path_to_preprocessing_params {confi_file}",
+    f"python TextClustering/basedOn_BOW/HDBSCAN_Diagnosis.py --path2corpus {df_file} --k_value {20}",
+    f"python TextClustering/plot_clustersets.py '{df_file}'",
+    f"python TextClustering/generate_topicwords.py '{df_file}'",
+]
+
+for script in script_queue:
+    print("\n########################################### executing ###########################################")
+    print(script)
+    print("####################################################################################################\n")
+    os.system(script)
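Because of the working-directory check above, the pipeline has to be started from the repository root. Assuming the script sits at the top level of the repository, as the commit message suggests:

    cd /path/to/nlp-in-diagnostic-texts-from-nephropathology
    python custom_topic_modeling_pipeline.py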
...@@ -17,16 +17,11 @@ if __name__ == '__main__': ...@@ -17,16 +17,11 @@ if __name__ == '__main__':
topic_pipeline = make_pipeline(vectorizer, model) topic_pipeline = make_pipeline(vectorizer, model)
from sklearn.datasets import fetch_20newsgroups
newsgroups = fetch_20newsgroups(subset="all")
corpus = newsgroups.data
# Sklearn gives the labels back as integers, we have to map them back to
# the actual textual label.
group_labels = [newsgroups.target_names[label] for label in newsgroups.target]
topic_pipeline.fit(corpus) topic_pipeline.fit(corpus)
print("launching topicwizard visualizer") print("launching topicwizard visualizer (may take a while)...")
topicwizard.visualize(corpus, model=topic_pipeline) topicwizard.visualize(corpus, model=topic_pipeline)
exit(0) exit(0)
\ No newline at end of file
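With the 20-newsgroups demo data removed, corpus has to be defined from the project data earlier in the script (outside the shown hunk). A hypothetical sketch of what that could look like, assuming the preprocessed dataframe stores one report per row in a 'text' column (both the path and the column name are assumptions, not confirmed by this diff):

    import pandas as pd
    df = pd.read_pickle("data/bow_merged_comments/bow_merged_comments.df.pkl")  # path is an assumption
    corpus = [str(t) for t in df["text"]]  # 'text' column name is an assumption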