Commit d3fb7ac6 authored by max-laptop

added a pipeline for clustering on our new data structures (see custom_topic_modeling_pipeline.py)

parent e47051e4
@@ -4,9 +4,13 @@ from database_preparation.utils_labeled_datasets import label_list_as_int_list
from database_preparation.preprocess import get_metadata
import pickle
import openpyxl
import os, sys
argv = sys.argv
# parameters:
df_cases_file = "data/bow_short_diag/bow_short_diag.df.pkl"
df_cases_file = argv[1]
print_latex = False
filter_stop_words = True
path2umap_pics = 'TextClustering/plots/UMAP/'
......
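Note on the change above (repeated in the next file): reading df_cases_file straight from argv[1] raises an IndexError when the script is started without arguments. A minimal sketch of a fallback that keeps the old hard-coded path as the default; the guard itself is a suggestion, not part of the commit:

import sys

# Suggested guard (not in the commit): fall back to the previous
# hard-coded default when no CLI argument is supplied, instead of
# crashing on a bare sys.argv[1] access.
DEFAULT_DF_CASES_FILE = "data/bow_short_diag/bow_short_diag.df.pkl"
df_cases_file = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_DF_CASES_FILE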
@@ -2,6 +2,7 @@ import pandas as pd
from TextClustering.utils_metrics import cluster_scatter_plot
import numpy as np
from database_preparation.utils_labeled_datasets import label_list_as_int_list
import sys, os
clustersets = ["GSDPMM", "KMeans", "LDA", "HDBSCAN",
               "top2vec", "Patho_BERT", "German_BERT"]
@@ -11,8 +12,13 @@ plot_titles = ["GSDPMM (UMAP representation)", "k-means (UMAP representation)",
"top2vec (UMAP representation)", "Patho-BERT (UMAP representation)",
"German-BERT (UMAP representation)"]
plot_titles = ["HDBSCAN (UMAP representation)"]
argv = sys.argv
df_cases_file = "data/bow_short_diag/bow_short_diag.df.pkl"
df_cases_file = argv[1]
def save_umap_plot(clustersetname, df, title=None):
    if 'label_' + clustersetname not in df:
        print("skipping " + clustersetname + ", it is not in df_cases_file.")
......
{
"source_data": {"path_to_dataset": "database/nephro_reports_sectiondivided.newest", "sections": ["conclusion"]},
"source_data": {"path_to_dataset": "data/nephro_reports_sectiondivided.newest", "sections": ["conclusion"]},
"tokenized": true,
"cased": false,
"stopword_filtered": true,
......
{
"source_data": {"path_to_dataset": "data/nephro_reports_sectiondivided.newest", "sections": ["short_diagnose", "short_symptoms", "CD73_result"]},
"tokenized": true,
"cased": true,
"stopword_filtered": false,
"use_combiner": false,
"use_replacer": false,
"lemma_mode": 4,
"punct_mode": 2,
"number_mode": 1
}
\ No newline at end of file
{
"source_data": {"path_to_dataset": "database/nephro_reports_sectiondivided.newest", "sections": ["short_diagnose"]},
"source_data": {"path_to_dataset": "data/nephro_reports_sectiondivided.newest", "sections": ["short_diagnose"]},
"tokenized": true,
"cased": true,
"stopword_filtered": false,
......
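These *_config.json files parameterize database_preparation/preprocess.py (see the --path_to_preprocessing_params default below). A minimal sketch of reading one, assuming plain json.load suffices; the field names are taken from the configs above, but the helper itself is hypothetical:

import json

def load_preprocessing_params(path):
    # Hypothetical helper: the real preprocess.py may consume the file differently.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

params = load_preprocessing_params("data/bow_diag_clustering/bow_diag_config.json")
print(params["source_data"]["sections"], params["stopword_filtered"])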
@@ -323,7 +323,7 @@ def main():
    sys.path.append(os.getcwd())
    parser = argparse.ArgumentParser()
    parser.add_argument("--path_to_preprocessing_params",
                        default='data/preprocessed_reports/bow_prepro_diag_config.json')
                        default='data/bow_diag_clustering/bow_diag_config.json')
    parser.add_argument("--target_path",
                        default=None)
    args = parser.parse_args()
......
import os, sys
# params:
#config_file = "data/bow_short_diag/bow_short_diag_config.json"
#config_file = "data/bow_diag_clustering/bow_diag_config.json"
config_file = "data/bow_merged_comments/bow_merged_comments_config.json"
df_file = config_file.replace('.json', '.df.pkl').replace('_config', '')
# check that we are in the correct working directory:
workdir = os.getcwd()
if not workdir.endswith('nlp-in-diagnostic-texts-from-nephropathology'):
    print(workdir + " is the wrong working directory.")
    print("Please make sure to run this script with working directory '.../path/to/nlp-in-diagnostic-texts-from-nephropathology'.")
    exit(1)
# Construct the clustering pipeline. This is a suggestion for how to chain all the scripts.
# It is also recommended to run each clustering script one by one to fine-tune the clusterings (with argument --find_k_value).
script_queue = [
    f"python database_preparation/preprocess.py --path_to_preprocessing_params {config_file}",
    f"python TextClustering/basedOn_BOW/HDBSCAN_Diagnosis.py --path2corpus {df_file} --k_value 20",
    f"python TextClustering/plot_clustersets.py '{df_file}'",
    f"python TextClustering/generate_topicwords.py '{df_file}'",
]
for script in script_queue:
    print("\n########################################### executing ###########################################")
    print(script)
    print("####################################################################################################\n")
    os.system(script)
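os.system ignores the exit status, so a failing preprocessing stage would still be followed by the clustering and plotting stages. A sketch of the same loop with subprocess, stopping at the first failing stage; swapping in subprocess.run is a suggestion, not part of the commit:

import shlex
import subprocess

for script in script_queue:
    print("executing: " + script)
    # check=True raises CalledProcessError as soon as one stage
    # exits with a non-zero status, aborting the pipeline.
    subprocess.run(shlex.split(script), check=True)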
@@ -17,16 +17,11 @@ if __name__ == '__main__':
    topic_pipeline = make_pipeline(vectorizer, model)
    from sklearn.datasets import fetch_20newsgroups
    newsgroups = fetch_20newsgroups(subset="all")
    corpus = newsgroups.data
    # Sklearn gives the labels back as integers; we have to map them back to
    # the actual textual labels.
    group_labels = [newsgroups.target_names[label] for label in newsgroups.target]
    topic_pipeline.fit(corpus)
    print("launching topicwizard visualizer")
    print("launching topicwizard visualizer (may take a while)...")
    topicwizard.visualize(corpus, model=topic_pipeline)
    exit(0)
\ No newline at end of file
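The hunk above starts at line 17, so vectorizer and model are defined earlier in the script. A self-contained sketch with stand-in components; CountVectorizer and NMF are assumptions (the diff does not show what the script actually instantiates), while the topicwizard.visualize call is taken verbatim from the diff:

import topicwizard
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline

# Stand-in components (assumptions): the diff does not show which
# vectorizer/model the script actually uses.
vectorizer = CountVectorizer(stop_words="english", max_features=20000)
model = NMF(n_components=10)
topic_pipeline = make_pipeline(vectorizer, model)

corpus = fetch_20newsgroups(subset="all").data
topic_pipeline.fit(corpus)

print("launching topicwizard visualizer (may take a while)...")
topicwizard.visualize(corpus, model=topic_pipeline)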