Commit d3fb7ac6 authored by max-laptop

added a pipeline for clustering on our new data structures (see custom_topic_modeling_pipeline.py)

parent e47051e4
@@ -4,9 +4,13 @@ from database_preparation.utils_labeled_datasets import label_list_as_int_list
 from database_preparation.preprocess import get_metadata
 import pickle
 import openpyxl
+import os, sys
+
+argv = sys.argv
 
 # parameters:
 df_cases_file = "data/bow_short_diag/bow_short_diag.df.pkl"
+df_cases_file = argv[1]
 print_latex = False
 filter_stop_words = True
 path2umap_pics = 'TextClustering/plots/UMAP/'
...
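Note: with df_cases_file = argv[1] the script now requires the dataframe path as its first argument and raises an IndexError when called without one. A minimal guard, as a sketch (not part of this commit), would keep the old hardcoded path as a fallback:

    import sys
    # fall back to the previous default when no path is passed (hypothetical, not in this commit)
    df_cases_file = sys.argv[1] if len(sys.argv) > 1 else "data/bow_short_diag/bow_short_diag.df.pkl"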
@@ -2,6 +2,7 @@ import pandas as pd
 from TextClustering.utils_metrics import cluster_scatter_plot
 import numpy as np
 from database_preparation.utils_labeled_datasets import label_list_as_int_list
+import sys, os
 
 clustersets = ["GSDPMM", "KMeans", "LDA", "HDBSCAN",
                "top2vec", "Patho_BERT", "German_BERT"]
@@ -11,8 +12,13 @@ plot_titles = ["GSDPMM (UMAP representation)", "k-means (UMAP representation)",
                "top2vec (UMAP representation)", "Patho-BERT (UMAP representation)",
                "German-BERT (UMAP representation)"]
 plot_titles = ["HDBSCAN (UMAP representation)"]
+argv = sys.argv
+
 df_cases_file = "data/bow_short_diag/bow_short_diag.df.pkl"
+df_cases_file = argv[1]
 
 def save_umap_plot(clustersetname, df, title=None):
     if not 'label_' + clustersetname in df:
         print("skipping " + clustersetname + ", it is not in df_cases_file.")
...
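With this change, the plot script takes the dataframe pickle as its first positional argument, matching how the pipeline script below invokes it:

    python TextClustering/plot_clustersets.py data/bow_short_diag/bow_short_diag.df.pkl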
 {
-"source_data": {"path_to_dataset": "database/nephro_reports_sectiondivided.newest", "sections": ["conclusion"]},
+"source_data": {"path_to_dataset": "data/nephro_reports_sectiondivided.newest", "sections": ["conclusion"]},
 "tokenized": true,
 "cased": false,
 "stopword_filtered": true,
...
+{
+"source_data": {"path_to_dataset": "data/nephro_reports_sectiondivided.newest", "sections": ["short_diagnose", "short_symptoms", "CD73_result"]},
+"tokenized": true,
+"cased": true,
+"stopword_filtered": false,
+"use_combiner": false,
+"use_replacer": false,
+"lemma_mode": 4,
+"punct_mode": 2,
+"number_mode": 1
+}
\ No newline at end of file
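This new config follows the naming convention that the pipeline script below relies on: dropping '_config' and swapping '.json' for '.df.pkl' yields the path of the preprocessed dataframe it produces:

    confi_file = "data/bow_short_diag/bow_short_diag_config.json"
    df_file = confi_file.replace('.json', '.df.pkl').replace('_config', '')
    # -> "data/bow_short_diag/bow_short_diag.df.pkl"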
 {
-"source_data": {"path_to_dataset": "database/nephro_reports_sectiondivided.newest", "sections": ["short_diagnose"]},
+"source_data": {"path_to_dataset": "data/nephro_reports_sectiondivided.newest", "sections": ["short_diagnose"]},
 "tokenized": true,
 "cased": true,
 "stopword_filtered": false,
...
...@@ -323,7 +323,7 @@ def main(): ...@@ -323,7 +323,7 @@ def main():
sys.path.append(os.getcwd()) sys.path.append(os.getcwd())
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--path_to_preprocessing_params", parser.add_argument("--path_to_preprocessing_params",
default='data/preprocessed_reports/bow_prepro_diag_config.json') default='data/bow_diag_clustering/bow_diag_config.json')
parser.add_argument("--target_path", parser.add_argument("--target_path",
default=None) default=None)
args = parser.parse_args() args = parser.parse_args()
......
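The new default makes preprocess.py pick up the relocated diagnosis config; running it explicitly is equivalent:

    python database_preparation/preprocess.py --path_to_preprocessing_params data/bow_diag_clustering/bow_diag_config.json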
+import os, sys
+
+# params:
+# confi_file = "data/bow_short_diag/bow_short_diag_config.json"
+# confi_file = "data/bow_diag_clustering/bow_diag_config.json"
+confi_file = "data/bow_merged_comments/bow_merged_comments_config.json"
+df_file = confi_file.replace('.json', '.df.pkl').replace('_config', '')
+
+# check that we are in the correct working directory:
+workdir = os.getcwd()
+if not workdir.endswith('nlp-in-diagnostic-texts-from-nephropathology'):
+    print(workdir + " is the wrong working directory.")
+    print("please make sure to run this script with working directory '.../path/to/nlp-in-diagnostic-texts-from-nephropathology'.")
+    exit(1)
+
+# Construct the clustering pipeline. This is a suggestion for how to use all the scripts.
+# It is also recommended to run each clustering script one by one to fine-tune the clusterings (with the --find_k_value argument).
+script_queue = [
+    f"python database_preparation/preprocess.py --path_to_preprocessing_params {confi_file}",
+    f"python TextClustering/basedOn_BOW/HDBSCAN_Diagnosis.py --path2corpus {df_file} --k_value {20}",
+    f"python TextClustering/plot_clustersets.py '{df_file}'",
+    f"python TextClustering/generate_topicwords.py '{df_file}'",
+]
+
+for script in script_queue:
+    print("\n########################################### executing ###########################################")
+    print(script)
+    print("####################################################################################################\n")
+    os.system(script)
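Because of the working-directory check above, the pipeline has to be started from the repository root. Assuming the script sits at the top level of the repository, as the commit message suggests:

    cd /path/to/nlp-in-diagnostic-texts-from-nephropathology
    python custom_topic_modeling_pipeline.py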
...@@ -17,16 +17,11 @@ if __name__ == '__main__': ...@@ -17,16 +17,11 @@ if __name__ == '__main__':
topic_pipeline = make_pipeline(vectorizer, model) topic_pipeline = make_pipeline(vectorizer, model)
from sklearn.datasets import fetch_20newsgroups
newsgroups = fetch_20newsgroups(subset="all")
corpus = newsgroups.data
# Sklearn gives the labels back as integers, we have to map them back to
# the actual textual label.
group_labels = [newsgroups.target_names[label] for label in newsgroups.target]
topic_pipeline.fit(corpus) topic_pipeline.fit(corpus)
print("launching topicwizard visualizer") print("launching topicwizard visualizer (may take a while)...")
topicwizard.visualize(corpus, model=topic_pipeline) topicwizard.visualize(corpus, model=topic_pipeline)
exit(0) exit(0)
\ No newline at end of file
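With the 20-newsgroups demo data removed, corpus has to be defined from the project data earlier in the script (outside the shown hunk). A hypothetical sketch of what that could look like, assuming the preprocessed dataframe stores one report per row in a 'text' column (both the path and the column name are assumptions, not confirmed by this diff):

    import pandas as pd
    df = pd.read_pickle("data/bow_merged_comments/bow_merged_comments.df.pkl")  # path is an assumption
    corpus = [str(t) for t in df["text"]]  # 'text' column name is an assumption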