Commit e47051e4 authored by max-laptop

renamed folder "database" to "data"

parent 1245ae68
......@@ -9,7 +9,7 @@ def argsparse_preamble():
parser.add_argument("--overwrite", action='store_true')#False: generate data only if it doesn already exist
parser.add_argument("--show_figures", action='store_true')
parser.add_argument("--clustered_data", default="HDBSCAN")
parser.add_argument("--path2corpus", default="database/bow_prepro_desc.pkl")
parser.add_argument("--path2corpus", default="data/bow_prepro_desc.pkl")
parser.add_argument("--loss_curve_check", action='store_true')
args = parser.parse_args()
......
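The `--overwrite` comment above suggests data is regenerated only when the target file is missing. A minimal sketch of that gating, assuming `argsparse_preamble()` returns the parsed args; `regenerate_corpus` is a hypothetical placeholder for the actual generation step:

```python
# Hedged sketch of how the --overwrite flag is presumably consumed.
import os

args = argsparse_preamble()
if args.overwrite or not os.path.isfile(args.path2corpus):
    regenerate_corpus(args.path2corpus)  # hypothetical regeneration step
```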
......@@ -57,8 +57,8 @@ def get_immediate_subdirectories(a_dir):
def cross_validate_with_bow_classifiers(label_set, fold_amount=10,
path2corpus="./database/bow_prepro_desc.pkl",
df_cases_file="database/df_cases.pkl"):
path2corpus="./data/bow_prepro_desc.pkl",
df_cases_file="data/df_cases.pkl"):
'''
Cross-validates the passed label_set with the text data saved in path2corpus and the labels saved in df_cases_file.
......
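For reference, a hedged usage sketch with the renamed defaults; the 'HDBSCAN' cluster-set id comes from the `--clustered_data` default above, and the shape of the return value is not shown in the diff:

```python
# Illustrative call with the new data/ paths; return type is an assumption.
results = cross_validate_with_bow_classifiers(
    "HDBSCAN",
    fold_amount=10,
    path2corpus="./data/bow_prepro_desc.pkl",
    df_cases_file="data/df_cases.pkl",
)
```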
......@@ -47,7 +47,7 @@ def create_pipeline(estimator, reduction=False):
steps.append(('classifier', estimator))
return Pipeline(steps)
def cross_validate_with_simple_SVM(label_set, path2corpus = "./database/bow_prepro_diag.pkl", path2dfcases='./database/df_cases.pkl'):
def cross_validate_with_simple_SVM(label_set, path2corpus = "./data/bow_prepro_diag.pkl", path2dfcases='./data/df_cases.pkl'):
"""
trains a simple SVM with the given data and
returns the 10-fold cross-validated accuracy value
......@@ -96,7 +96,7 @@ def cross_validate_with_simple_SVM(label_set, path2corpus = "./database/bow_prep
return metrics
def cross_validate_label_corpus_with_simple_SVM(labels, path2corpus = "./database/bow_prepro_diag.pkl", sample = True):
def cross_validate_label_corpus_with_simple_SVM(labels, path2corpus = "./data/bow_prepro_diag.pkl", sample = True):
"""
trains a simple SVM with the given data and
returns the 10-fold cross-validated accuracy value
......@@ -145,7 +145,7 @@ def cross_validate_label_corpus_with_simple_SVM(labels, path2corpus = "./databas
return np.mean(metrics.scores['accuracy'])
def train_SVM_with_clusterset(label_set, path2corpus = "./database/bow_prepro_diag.pkl", path2dfcases='./database/df_cases.pkl'):
def train_SVM_with_clusterset(label_set, path2corpus = "./data/bow_prepro_diag.pkl", path2dfcases='./data/df_cases.pkl'):
"""
trains an SVM on the whole data set and saves it as:
"./ModelTestingAndExplaining/models/SVM_trained_with_" + label_set + "_clustered.pkl"
......@@ -207,15 +207,15 @@ def update_cls_metric(label_set, cls_accuracy):
def update_cls_metric_for_each_clusterset():
'''
does 10-fold-cross-validation with a svm for each cluster-set saved in './database/df_cases.pkl'
using always the text in 'database/diag_lst_tokenized.pkl'
performs 10-fold cross-validation with an SVM for each cluster set saved in './data/df_cases.pkl',
always using the text in 'data/diag_lst_tokenized.pkl'
'''
label_sets = dt.get_all_label_set_ids()
# label_sets = ["German_BERT"]
for label_set in label_sets:
accuracy = np.mean(cross_validate_with_simple_SVM(label_set,
'database/diag_lst_tokenized.pkl',
'./database/df_cases.pkl').scores['accuracy'])
'data/diag_lst_tokenized.pkl',
'./data/df_cases.pkl').scores['accuracy'])
print("svm-cls-accuracy of cluster set "+label_set+": "+str(accuracy))
update_cls_metric(label_set, accuracy)
......@@ -223,10 +223,10 @@ def update_cls_metric_for_each_clusterset():
def main():
#update_cls_metric_for_each_clusterset()
cluster_set_name = "German_BERT"
#text_data = 'database/darmischaemie_prostata_txt_lst.pkl' cluster_set_dict = './database/df_cases2.pkl'
text_data = 'database/diag_lst.pkl'
#text_data = 'database/diag_lst_tokenized.pkl'
cluster_set_dict = './database/df_cases.pkl'
#text_data = 'data/darmischaemie_prostata_txt_lst.pkl' cluster_set_dict = './data/df_cases2.pkl'
text_data = 'data/diag_lst.pkl'
#text_data = 'data/diag_lst_tokenized.pkl'
cluster_set_dict = './data/df_cases.pkl'
train_SVM_with_clusterset(cluster_set_name, text_data, cluster_set_dict)
......
......@@ -6,8 +6,8 @@ import os
cluster_sets = ['HDBSCAN']
# params:
path2corpus_bow_preprocessed = 'database/bow_prepro_desc.pkl'
path2corpus_embedding_preprocessed = 'database/embedding_prepro_desc.pkl'
path2corpus_bow_preprocessed = 'data/bow_prepro_desc.pkl'
path2corpus_embedding_preprocessed = 'data/embedding_prepro_desc.pkl'
#check working directory:
......
......@@ -11,8 +11,8 @@ def argsparse_preamble():
parser.add_argument("--show_figures", action='store_true')
parser.add_argument("--model2use", default="German_BERT")
parser.add_argument('--do_embedding', action='store_true')
parser.add_argument("--path2corpus", default='database/bow_prepro_diag.pkl')
parser.add_argument("--df_cases_file", default='database/df_cases.pkl')
parser.add_argument("--path2corpus", default='data/bow_prepro_diag.pkl')
parser.add_argument("--df_cases_file", default='data/df_cases.pkl')
args = parser.parse_args()
......
......@@ -26,7 +26,7 @@ from sentence_transformers import SentenceTransformer
from database_preparation.preprocess import print_meta_data
embedding_backup_folder = "database/backup_files/"
embedding_backup_folder = "data/backup_files/"
if not os.path.isdir(embedding_backup_folder):
os.makedirs(embedding_backup_folder)
path_2_pathoBERT = "./LanguageModelling/ger-patho-bert-2"
......
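Given the SentenceTransformer import and the local Patho-BERT path above, loading and encoding presumably reduce to the following; that the checkpoint directory is sentence-transformers-compatible is an assumption:

```python
# Hedged sketch: load the local ger-patho-bert-2 checkpoint and embed texts.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(path_2_pathoBERT)  # assumes an s-t compatible dir
embeddings = model.encode(["Beispielbefund eines Nierenpräparats."])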
......@@ -14,10 +14,10 @@ recalc_cls_accuracy = True
use_always_bow_data_for_svm_accuracy = True
sort_table_by = ['s-score'] # s-score or cls accuracy
table_save_path = 'TextClustering/tables/cluster_metrics_overview'
path2corpus_bow_preprocessed = 'database/bow_prepro_diag.pkl'
path2corpus_embedding_preprocessed = 'database/embedding_prepro_diag.pkl'
path2corpus_bow_preprocessed = 'data/bow_prepro_diag.pkl'
path2corpus_embedding_preprocessed = 'data/embedding_prepro_diag.pkl'
scorepath = "TextClustering/cluster_metrics/"
df_cases_file = './database/df_cases.pkl'
df_cases_file = './data/df_cases.pkl'
def main():
......
import os, sys
# params:
path2corpus_bow_preprocessed_diagnosis = 'database/bow_prepro_diag.pkl'
path2corpus_embedding_preprocessed_diagnosis = 'database/embedding_prepro_diag.pkl'
path2corpus_bow_preprocessed_diagnosis = 'data/bow_prepro_diag.pkl'
path2corpus_embedding_preprocessed_diagnosis = 'data/embedding_prepro_diag.pkl'
# check if we are at correct working directory:
workdir = os.getcwd()
......
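Several scripts carry a "check if we are at correct working directory" step. A minimal sketch of what such a guard might look like; the probe (the presence of the renamed data/ folder) is an assumption, not the scripts' exact check:

```python
# Hedged sketch of the working-directory guard; the probe is illustrative.
import os
import sys

workdir = os.getcwd()
if not os.path.isdir(os.path.join(workdir, "data")):
    sys.exit("Run this script from the repository root (no data/ folder found).")
```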
......@@ -7,7 +7,7 @@ import argparse
sys.path.append(os.getcwd())
parser = argparse.ArgumentParser()
parser.add_argument("--df_cases_file", default="database/df_cases.pkl")
parser.add_argument("--df_cases_file", default="data/df_cases.pkl")
args = parser.parse_args()
plot_author_histos = False
......
......@@ -6,7 +6,7 @@ import pickle
import openpyxl
# parameters:
df_cases_file = "database/bow_short_diag/bow_short_diag.df.pkl"
df_cases_file = "data/bow_short_diag/bow_short_diag.df.pkl"
print_latex = False
filter_stop_words = True
path2umap_pics = 'TextClustering/plots/UMAP/'
......
......@@ -11,7 +11,7 @@ plot_titles = ["GSDPMM (UMAP representation)", "k-means (UMAP representation)",
"top2vec (UMAP representation)", "Patho-BERT (UMAP representation)",
"German-BERT (UMAP representation)"]
plot_titles = ["HDBSCAN (UMAP representation)"]
df_cases_file = "database/bow_short_diag/bow_short_diag.df.pkl"
df_cases_file = "data/bow_short_diag/bow_short_diag.df.pkl"
def save_umap_plot(clustersetname, df, title=None):
if not 'label_' + clustersetname in df:
......@@ -23,7 +23,7 @@ def save_umap_plot(clustersetname, df, title=None):
umap_text_features2D = np.asarray([[e for e in df['umapX_' + clustersetname]],
[e for e in df['umapY_' + clustersetname]]])
except KeyError:  # the umap columns for this cluster set were never computed
print("there is no umapX_" + clustersetname + " in database/df_cases.pkl. => skipping")
print("there is no umapX_" + clustersetname + " in data/df_cases.pkl. => skipping")
return
umap_text_features2D = np.transpose(umap_text_features2D)
......
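save_umap_plot transposes the 2D UMAP features before plotting, but the plotting step itself is cut off by the hunk. A hedged reconstruction; styling and output filename are assumptions, `path2umap_pics` is defined in an earlier hunk, and `label_list_as_int_list` appears elsewhere in this diff:

```python
# Hedged sketch of the plotting step that presumably follows the transpose.
import matplotlib.pyplot as plt

labels = label_list_as_int_list(df["label_" + clustersetname])
plt.figure()
plt.scatter(umap_text_features2D[:, 0], umap_text_features2D[:, 1],
            c=labels, s=4, cmap="tab20")
plt.title(title if title else clustersetname)
plt.savefig(path2umap_pics + clustersetname + ".png", dpi=300)
plt.close()
```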
......@@ -376,7 +376,7 @@ def print_excel_topciwordlist_as_latex(cluster_set, excel_file_path=None, topic_
if excel_file_path is None:
excel_file_path = 'TextClustering/tables/WordsPerCluster_' + cluster_set + '_temp.xlsx'
df_cases = pd.read_pickle("database/df_cases.pkl")
df_cases = pd.read_pickle("data/df_cases.pkl")
# convert nan-values in int(-1):
clusters = label_list_as_int_list(df_cases['label_' + cluster_set])
......
......@@ -4,8 +4,8 @@ import pickle
# script parameters:
modelname = "bert-base-german-cased"
path2corpus_embedding_preprocessed_diagnosis = 'database/embedding_prepro_diag.pkl'
path2corpus_embedding_preprocessed_description = 'database/embedding_prepro_desc.pkl'
path2corpus_embedding_preprocessed_diagnosis = 'data/embedding_prepro_diag.pkl'
path2corpus_embedding_preprocessed_description = 'data/embedding_prepro_desc.pkl'
tokenizer = AutoTokenizer.from_pretrained(modelname)
model = AutoModelForMaskedLM.from_pretrained(modelname)
......
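With the tokenizer and masked-LM loaded above, one plausible way to derive a text embedding is mean-pooling the last hidden state; this pooling choice is an assumption, not necessarily the authors' method:

```python
# Hedged sketch: embed a text with the German BERT backbone loaded above.
import torch

inputs = tokenizer("Beispieltext einer Diagnose.", return_tensors="pt")
with torch.no_grad():
    output = model(**inputs, output_hidden_states=True)
embedding = output.hidden_states[-1].mean(dim=1)  # shape: (1, hidden_size)
```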
......@@ -5,10 +5,10 @@ import os
path_to_reports = '../DataNephroTexts/reports'
author_names = "Name1 Name2 Name3 Name4" ## names of the pathologists who wrote the reports
splitted_reports_folder_path = '../DataNephroTexts'
path2corpus_bow_preprocessed_diagnosis = 'database/bow_prepro_diag.pkl'
path2corpus_embedding_preprocessed_diagnosis = 'database/embedding_prepro_diag.pkl'
path2corpus_bow_preprocessed_description = 'database/bow_prepro_desc.pkl'
path2corpus_embedding_preprocessed_description = 'database/embedding_prepro_desc.pkl'
path2corpus_bow_preprocessed_diagnosis = 'data/bow_prepro_diag.pkl'
path2corpus_embedding_preprocessed_diagnosis = 'data/embedding_prepro_diag.pkl'
path2corpus_bow_preprocessed_description = 'data/bow_prepro_desc.pkl'
path2corpus_embedding_preprocessed_description = 'data/embedding_prepro_desc.pkl'
# check if we are at correct working directory:
workdir = os.getcwd()
......
......@@ -34,7 +34,7 @@ def get_names(text):
names.append(token)
return names
def add_author_labels_to_df_cases(path_to_end_sections, authors, df_cases_file = "database/df_cases.pkl"):
def add_author_labels_to_df_cases(path_to_end_sections, authors, df_cases_file = "data/df_cases.pkl"):
df = pd.read_pickle(df_cases_file)
filenames = df["end_text_files"]
author_labels = []
......
......@@ -323,7 +323,7 @@ def main():
sys.path.append(os.getcwd())
parser = argparse.ArgumentParser()
parser.add_argument("--path_to_preprocessing_params",
default='database/preprocessed_reports/bow_prepro_diag_config.json')
default='data/preprocessed_reports/bow_prepro_diag_config.json')
parser.add_argument("--target_path",
default=None)
args = parser.parse_args()
......
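The `--path_to_preprocessing_params` default points at a JSON config under the renamed data/ folder; loading it presumably reduces to the following, where the flat-dict layout is an assumption:

```python
# Hedged sketch: read the preprocessing parameters the argument points at.
import json

with open(args.path_to_preprocessing_params, encoding="utf-8") as f:
    prepro_params = json.load(f)  # assumed: a flat dict of parameters
```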
......@@ -6,16 +6,16 @@ from sklearn.feature_extraction.text import TfidfVectorizer
from database_preparation.preprocess import print_meta_data, prepro_params_2_string
# parameters:
df_cases_file = "database/df_cases.pkl"
text_corpus_paths = ['database/embedding_prepro_diag.pkl',
'database/bow_prepro_diag.pkl',
'database/embedding_prepro_desc.pkl',
'database/bow_prepro_desc.pkl']
vector_corpus_paths = ['database/diagnosis_texts_vectorized_DR_preprocessed.pkl',
'database/diagnosis_texts_vectorized_bow_preprocessed.pkl',
'database/description_texts_vectorized_DR_preprocessed.pkl',
'database/description_texts_vectorized_bow_preprocessed.pkl']
df_cases_file = "data/df_cases.pkl"
text_corpus_paths = ['data/embedding_prepro_diag.pkl',
'data/bow_prepro_diag.pkl',
'data/embedding_prepro_desc.pkl',
'data/bow_prepro_desc.pkl']
vector_corpus_paths = ['data/diagnosis_texts_vectorized_DR_preprocessed.pkl',
'data/diagnosis_texts_vectorized_bow_preprocessed.pkl',
'data/description_texts_vectorized_DR_preprocessed.pkl',
'data/description_texts_vectorized_bow_preprocessed.pkl']
####### functions ##########
......
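The paired text_corpus_paths / vector_corpus_paths lists suggest a vectorize-and-pickle loop roughly like the one below; that the text pickles hold lists of raw strings (rather than token lists) is an assumption:

```python
# Hedged sketch of the vectorization loop implied by the path pairs above.
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

for text_path, vector_path in zip(text_corpus_paths, vector_corpus_paths):
    with open(text_path, "rb") as f:
        texts = pickle.load(f)  # assumed: list of raw text strings
    vectors = TfidfVectorizer().fit_transform(texts)
    with open(vector_path, "wb") as f:
        pickle.dump(vectors, f)
```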
......@@ -33,7 +33,7 @@ parser.add_argument("--path_to_reports",
parser.add_argument("--target_folder_path",
default='../DataNephroTexts')
parser.add_argument("--df_cases_file",
default='database/df_cases.pkl')
default='data/df_cases.pkl')
parser.add_argument("--author_names",
default="Name1 Name2")
parser.add_argument("--text_encoding",
......
......@@ -74,8 +74,8 @@ def text_label_2_labeled_dataset(texts, unfiltered_labels, print_infos=False):
def text_label_files_to_labeled_dataset(label_set,
path2corpus="./database/bow_prepro_desc.pkl",
df_cases_path="./database/df_cases.pkl", print_infos=False):
path2corpus="./data/bow_prepro_desc.pkl",
df_cases_path="./data/df_cases.pkl", print_infos=False):
'''
- sorts out outlier documents (which belong to cluster -1 or cluster None)
- converts the pandas dataframe to the datasets.Dataset type.
......@@ -89,11 +89,11 @@ def text_label_files_to_labeled_dataset(label_set,
def get_all_label_set_ids():
df = pd.read_pickle("./database/df_cases.pkl")
df = pd.read_pickle("./data/df_cases.pkl")
return [e[6:] for e in df.columns if "label_" in e]
def get_filename_label_tuple(label_set, get_micro_txt=True, df_cases_file="./database/df_cases.pkl"):
def get_filename_label_tuple(label_set, get_micro_txt=True, df_cases_file="./data/df_cases.pkl"):
'''
returns (textfilename_list, label_list) as ([filenames], [labels as int list])
it will contain outlier labels (they have value None or -1)
......@@ -110,7 +110,7 @@ def get_filename_label_tuple(label_set, get_micro_txt=True, df_cases_file="./dat
return df_cases["diagnosis_text_files"], int_labels
def get_amount_unique_labels(label_set, df_cases_file="./database/df_cases.pkl"):
def get_amount_unique_labels(label_set, df_cases_file="./data/df_cases.pkl"):
'''
returns the number of unique labels (nan and -1 classes are not counted).
If label_set does not exist, you will get
......@@ -146,13 +146,13 @@ def generate_save_hf_dataset(label_set="LDA", overwrite=True, lower=False):
'''
Generates a labeled dataset of type datasets.Dataset
(datasets is a Hugging Face library)
and saves it under "./database/labeled_dataframes/labeld_dataset_" + label_set
and saves it under "./data/labeled_dataframes/labeld_dataset_" + label_set
'''
import datasets
import pyarrow as pa
dataset_path = "./database/labeled_dataframes/labeld_dataset_" + label_set
dataset_path = "./data/labeled_dataframes/labeld_dataset_" + label_set
if os.path.exists(dataset_path):
print(dataset_path + " already exists.")
......@@ -162,11 +162,11 @@ def generate_save_hf_dataset(label_set="LDA", overwrite=True, lower=False):
print("skipping generation of " + dataset_path)
return
df_cases = pd.read_pickle("./database/df_cases.pkl")
df_cases = pd.read_pickle("./data/df_cases.pkl")
# print(df_cases.columns)
# collect all text-label pairs, skipping invalid labels
diag_text_rokenized = pd.read_pickle("./database/diag_lst_tokenized.pkl")
diag_text_rokenized = pd.read_pickle("./data/diag_lst_tokenized.pkl")
texts = []
labels = []
diagnoses = []
......
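generate_save_hf_dataset ends outside the hunk; based on the imports (datasets, pyarrow) and the texts/labels lists collected above, the final build-and-save step plausibly looks like this, with column names being assumptions:

```python
# Hedged sketch of the final step of generate_save_hf_dataset.
import datasets

dataset = datasets.Dataset.from_dict({"text": texts, "label": labels})
dataset.save_to_disk(dataset_path)  # "./data/labeled_dataframes/labeld_dataset_" + label_set
```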
# Topic modeling loop project
## Installation
Create a new environment, then install the required Python packages with:
```pip install -r requirements.txt```
## Usage
... still in the experimentation phase ...
\ No newline at end of file
topic-wizard
\ No newline at end of file