Commit 10d56795 authored by max-laptop

improved pipelines for new topic modeling tasks

parent c7bab612
@@ -18,13 +18,18 @@ Feel free to use and adapt the scripts to your own needs.
## Requirements
Create a new environment, then install the required python packages with:
```pip install -r requirements.txt```
+(so far tested with Python 3.10)
The script ```database_preparation/preprocess.py``` requires some NLTK corpora:
```
import nltk
nltk.download('stopwords')
nltk.download('punkt')
+nltk.download('punkt_tab')
```
\ No newline at end of file
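The added `punkt_tab` download completes the set of corpora the preprocessing script needs. If repeated setup runs should skip corpora that are already installed, a guard like the following could be used (a minimal sketch; the check-before-download logic is an assumption, not part of this commit):
```
import nltk

# fetch the corpora used by database_preparation/preprocess.py,
# skipping anything already present in the local nltk data path
for resource, path in [("stopwords", "corpora/stopwords"),
                       ("punkt", "tokenizers/punkt"),
                       ("punkt_tab", "tokenizers/punkt_tab")]:
    try:
        nltk.data.find(path)
    except LookupError:
        nltk.download(resource)
```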
@@ -15,9 +15,9 @@ from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
import nltk
-import pickle
import datasets
import pyarrow as pa
+import pickle

fold_amount = 10
@@ -102,6 +102,7 @@ def cross_validate_label_corpus_with_simple_SVM(labels, path2corpus = "./databas
    returns 10-fold-cross-validated accuracy value
    """
    texts = pd.read_pickle(path2corpus)
    from database_preparation.utils_labeled_datasets import text_label_2_labeled_dataset
...
@@ -6,7 +6,7 @@ import os
import sys
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
-from sklearn.metrics import plot_confusion_matrix
+#from sklearn.metrics import ConfusionMatrixDisplay  # plot_confusion_matrix  # TODO: replace with ConfusionMatrixDisplay
import seaborn as sn
import pandas as pd
import pickle
...
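The TODO above notes that the removed `plot_confusion_matrix` still needs a `ConfusionMatrixDisplay` replacement. A minimal sketch of what that swap could look like (the toy labels are placeholders, not data from this repository):
```
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_predictions builds the same figure
# directly from true and predicted labels
y_true = [0, 1, 1, 0, 1]   # placeholder labels
y_pred = [0, 1, 0, 0, 1]
ConfusionMatrixDisplay.from_predictions(y_true, y_pred, cmap=plt.cm.Blues)
plt.show()
```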
-#%% argsparse section
+# argsparse section
import sys, os
sys.path.append(os.getcwd())
from TextClustering.argsparse_clustering_preamble import argsparse_preamble
args = argsparse_preamble()
-from database_preparation.utils_labeled_datasets import is_text_lst_tokenized, is_text_lst_tfidf_vectorized
-import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import pandas as pd
@@ -18,21 +16,28 @@ from nltk import RegexpTokenizer
from TextClustering.utils_metrics import ClusterMetrics
from database_preparation.preprocess import print_meta_data
from TextClassification.classification_for_cluster_evaluation import cross_validate_label_corpus_with_simple_SVM
+import json

tokenizer = RegexpTokenizer(r'\w+')

-#%% load the data
-with open(args.path2corpus, 'rb') as f:
-    diag_lst = pickle.load(f)
+# load the data
+'''with open(args.path2corpus, 'rb') as f:
+    doc_list = pickle.load(f)'''
+assert args.path2corpus[-4:] == '.pkl'
+corpus_df = pd.read_pickle(args.path2corpus)
+corpus_dict = {case_id: corpus_df.loc[corpus_df['case_id'] == case_id, 'preprocessed_text'].values[0] for case_id in corpus_df['case_id']}
+doc_list = corpus_df['preprocessed_text'].tolist()

-print_meta_data(args.path2corpus)
+#print_meta_data(args.path2corpus)

def identity(word):
    return word

-text_is_vectorized = is_text_lst_tfidf_vectorized(args.path2corpus)
-if not (is_text_lst_tokenized(args.path2corpus) or text_is_vectorized):
+text_is_vectorized = False  #is_text_lst_tfidf_vectorized(args.path2corpus)
+if not type(doc_list[0]) == list:
    print("Error: " + args.path2corpus + ' has to be a list of tokenized texts or tfidf-vectorized texts!')
    exit(1)
@@ -42,11 +47,13 @@ def create_vectorizer(data):
    return vec

if text_is_vectorized:
-    text_features = diag_lst
+    text_features = doc_list
else:
-    text_features = create_vectorizer(diag_lst)
+    text_features = create_vectorizer(doc_list)

-#%% perform umap for dimension-reduction (for cluster-detection)
+print(f"tf-idf vectorized corpus generated {text_features.shape[0]} vectors of length {text_features.shape[1]}.")
+
+# perform umap for dimension-reduction (for cluster-detection)
umap_text_features = umap.UMAP(n_neighbors=15,
                               n_components=5,
                               metric='cosine').fit_transform(text_features)
@@ -57,7 +64,7 @@ umap_text_features2D = umap.UMAP(n_neighbors=15,
                                 min_dist=0.0, metric='cosine').fit_transform(text_features)

if args.find_k_value:
-    # %% perform hdbscan for cluster-dectection with different cluster sizes to find a good solution...by eye..
+    # perform hdbscan for cluster-detection with different cluster sizes to find a good solution... by eye
    list_cluster_size = [int(k) for k in np.arange(3, 23, 1)]
    s_score, n_cluster, svm_scores = [], [], []
    for i_cluster_size in list_cluster_size:
@@ -67,20 +74,20 @@ if args.find_k_value:
                                  cluster_selection_method='eom').fit(umap_text_features)

-        result = pd.DataFrame(umap_text_features2D, columns=['x', 'y'])
-        result['labels'] = cluster.labels_.tolist()  # cluster.labels_
-        print(np.unique(result.labels))
+        df_clustering_result2d = pd.DataFrame(umap_text_features2D, columns=['x', 'y'])
+        df_clustering_result2d['labels'] = cluster.labels_.tolist()  # cluster.labels_
+        print(np.unique(df_clustering_result2d.labels))

-        #%% Visualize clusters
-        outliers = result.loc[result.labels == -1, :]
-        clustered = result.loc[result.labels != -1, :]
+        # Visualize clusters
+        outliers = df_clustering_result2d.loc[df_clustering_result2d.labels == -1, :]
+        clustered = df_clustering_result2d.loc[df_clustering_result2d.labels != -1, :]
        clustered['labels'] = [str(i) for i in clustered['labels']]
        evaluation = ClusterMetrics(umap_text_features, cluster.labels_.tolist())
        s_score.append(evaluation.s_score)
        svm_scores.append(
-            cross_validate_label_corpus_with_simple_SVM(cluster.labels_.tolist(), args.path2corpus + '.pkl',
+            cross_validate_label_corpus_with_simple_SVM(cluster.labels_.tolist(), args.path2corpus,
                                                        False))
        n_cluster.append(len(np.unique(cluster.labels_.tolist())))
@@ -103,30 +110,36 @@ if args.find_k_value:
    exit()

-#%% perform hdbscan with best cluster size
+# perform hdbscan with best cluster size
cluster = hdbscan.HDBSCAN(min_cluster_size=args.k_value,
                          metric='euclidean',
                          cluster_selection_method='eom').fit(umap_text_features)

-result = pd.DataFrame(umap_text_features2D, columns=['x', 'y'])
-result['labels'] = cluster.labels_.tolist()  # cluster.labels_
-clusters = np.int8([str(i) for i in result['labels']])
-outliers = result.loc[result.labels == -1, :]
-clusters_no_outliers = result.loc[result.labels != -1, :]
-unique_clusters = np.unique(result.labels)
+df_clustering_result2d = pd.DataFrame(umap_text_features2D, columns=['x', 'y'])
+df_clustering_result2d['labels'] = cluster.labels_.tolist()  # cluster.labels_
+clusters = np.int8([str(i) for i in df_clustering_result2d['labels']])
+outliers = df_clustering_result2d.loc[df_clustering_result2d.labels == -1, :]
+clusters_no_outliers = df_clustering_result2d.loc[df_clustering_result2d.labels != -1, :]
+unique_clusters = np.unique(df_clustering_result2d.labels)
+print(f"\nfound {len(unique_clusters[unique_clusters>-1])} clusters.\n")

# save umaped vectors:
-df = pd.read_pickle(args.df_cases_file)
-df['umapX_HDBSCAN'] = umap_text_features2D[:, 0]
-df['umapY_HDBSCAN'] = umap_text_features2D[:, 1]
-df['label_HDBSCAN'] = clusters
-df.to_pickle(args.df_cases_file)
+#df = pd.read_pickle(args.df_cases_file)
+corpus_df['umapX_HDBSCAN'] = umap_text_features2D[:, 0]
+corpus_df['umapY_HDBSCAN'] = umap_text_features2D[:, 1]
+corpus_df['label_HDBSCAN'] = clusters
+corpus_df.to_pickle(args.path2corpus)
+corpus_df.to_csv(args.path2corpus.replace('.pkl', '.csv'))
+#corpus_df.to_excel(args.path2corpus.replace('.pkl', '.xlsx'))

-#%% and evaluate the results with several metrics (not needing ground truth)
-evaluation = ClusterMetrics(umap_text_features[result.labels >= 0,], clusters_no_outliers.labels.tolist(),
-                            file_name= "TextClustering/cluster_metrics/HDBSCAN_metrics.pkl")
+print(f"updated {args.path2corpus} with umapX_HDBSCAN, umapY_HDBSCAN and label_HDBSCAN.")
+print(corpus_df.head())
+print(f"cluster count: {len(unique_clusters)}")
+print(f"outliers: {len(outliers)}")
+
+# and evaluate the results with several metrics (not needing ground truth)
+evaluation = ClusterMetrics(umap_text_features[df_clustering_result2d.labels >= 0,], clusters_no_outliers.labels.tolist(),
+                            file_name=args.path2corpus.replace('.pkl', '').replace('.json', '').replace('.df', '') + '_HDBSCAN_metrics')
evaluation.write_to_file()
...
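With this commit the HDBSCAN results are written back into the corpus DataFrame itself (umapX_HDBSCAN, umapY_HDBSCAN, label_HDBSCAN) instead of a separate df_cases file. A minimal sketch of reading them back downstream, assuming the bow_short_diag corpus path used elsewhere in this commit:
```
import pandas as pd

# the HDBSCAN script stores its results directly in the corpus pickle
corpus_df = pd.read_pickle("database/bow_short_diag/bow_short_diag.df.pkl")

# 2D UMAP coordinates and one cluster label per report (-1 marks outliers)
print(corpus_df[["case_id", "umapX_HDBSCAN", "umapY_HDBSCAN", "label_HDBSCAN"]].head())
print(corpus_df["label_HDBSCAN"].value_counts())
```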
-#%% argsparse section
+# argsparse section
import sys, os
sys.path.append(os.getcwd())
@@ -13,7 +13,7 @@ if not is_text_lst_tokenized(args.path2corpus):
          'Please pass texts list where each text is tokenized (a list of words).')
    exit(1)

-#%% import section
+# import section
import pickle
import gensim
import gensim.corpora as corpora
@@ -26,20 +26,20 @@ from tqdm import tqdm
from database_preparation.preprocess import print_meta_data
from TextClassification.classification_for_cluster_evaluation import cross_validate_label_corpus_with_simple_SVM

-#%% load the diag and main_diag list
+# load the diag and main_diag list
with open(args.path2corpus, 'rb') as f:
    diag_lst = pickle.load(f)
print_meta_data(args.path2corpus)

-#%% prepare database_preparation for LDA-model-trainng
+# prepare database_preparation for LDA-model training
# Creates the dictionary, which is a mapping of word IDs to words.
words = corpora.Dictionary(diag_lst)

# Turns each document into a bag of words.
corpus = [words.doc2bow(doc) for doc in diag_lst]  # is that already a model?

-#%% train LDA-model with different number of clusters
+# train LDA-model with different number of clusters
if args.find_k_value:
    limit=21; start=5; step=1
    coherence_values = []
@@ -72,7 +72,7 @@ if args.find_k_value:
        n_cluster.append(len(np.unique(np.asarray(predictedCluster))))
        print("coherence: " + str(coherencemodel.get_coherence()))

-    #%% visualize the results
+    # visualize the results
    x = range(start, limit, step)
    fig, ax1 = plt.subplots()
    ax2 = ax1.twinx()
@@ -92,7 +92,7 @@ if args.find_k_value:
    plt.show()
    exit()

-#%% train LDA-model
+# train LDA-model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=words,
                                            num_topics=args.k_value,
@@ -103,7 +103,7 @@ lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            per_word_topics=True)

-#%% get topic weights / features
+# get topic weights / features
topic_weights = []
for i, row_list in enumerate(lda_model[corpus]):
    topic_weights.append([w for i, w in row_list[0]])
@@ -111,7 +111,7 @@ for i, row_list in enumerate(lda_model[corpus]):
# Array of topic weights
text_features = pd.DataFrame(topic_weights).fillna(0).values

-#%% get prediction
+# get prediction
predictedCluster = np.argmax(text_features, axis=1)

# and add it to the dataframe
@@ -126,7 +126,7 @@ df['pcaX_LDA'] = reduced_features[:, 0]
df['pcaY_LDA'] = reduced_features[:, 1]

-#%% and with umap
+# and with umap
import umap
umap_text_features2D = umap.UMAP(n_neighbors=15,
                                 n_components=2,
@@ -136,7 +136,7 @@ df['umapX_LDA'] = umap_text_features2D[:, 0]
df['umapY_LDA'] = umap_text_features2D[:, 1]
df.to_pickle(args.df_cases_file)

-#%% evalute the model
+# evaluate the model
from TextClustering.utils_metrics import ClusterMetrics
evaluation = ClusterMetrics(text_features, predictedCluster,
                            file_name= "TextClustering/cluster_metrics/LDA_metrics.pkl")
...
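The inline comment above ("is that already a model?") touches a common point of confusion: `doc2bow` only converts a tokenized document into (token-id, count) pairs; the LDA model is trained on that representation afterwards. A small self-contained illustration with placeholder documents:
```
import gensim.corpora as corpora

# two placeholder tokenized documents standing in for the preprocessed reports
toy_docs = [["niere", "biopsie", "glomerulus"],
            ["niere", "tubulus", "tubulus"]]

words = corpora.Dictionary(toy_docs)                  # word ID <-> word mapping
bow_corpus = [words.doc2bow(doc) for doc in toy_docs]

# each document is now a list of (token_id, count) pairs, e.g. the repeated
# token gets count 2; this sparse representation is what LdaModel is trained on
print(words.token2id)
print(bow_corpus)
```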
@@ -6,26 +6,25 @@ import pickle
import openpyxl

# parameters:
-df_cases_file = "database/df_cases.pkl"
+df_cases_file = "database/bow_short_diag/bow_short_diag.df.pkl"
print_latex = False
filter_stop_words = True
path2umap_pics = 'TextClustering/plots/UMAP/'
save_umap_picture_in_table = True
-path2corpus_bow_preprocessed = 'database/bow_prepro_diag.pkl'
-path2corpus_embedding_preprocessed = 'database/embedding_prepro_diag.pkl'
+stopword_filtered_corpus = True

####### functions ##########

def main():
-    cluster_sets = ['KMeans', 'LDA', 'HDBSCAN', 'German_BERT', 'Patho_BERT', 'top2vec', 'GSDPMM']
+    cluster_sets = ['HDBSCAN']  #['HDBSCAN', 'German_BERT', 'Patho_BERT', 'top2vec', 'GSDPMM']
    # cluster_sets = ['German_BERT']
    df_cases = pd.read_pickle(df_cases_file)

    for cluster_set in cluster_sets:
        # re-generate the topic words:
-        excel_file_path = 'TextClustering/tables/WordsPerCluster_' + cluster_set + '.xlsx'
+        excel_file_path = df_cases_file.replace('.df.pkl', f'_{cluster_set}_topwords.xlsx')  #'TextClustering/tables/WordsPerCluster_' + cluster_set + '.xlsx'

        # convert nan-values to int(-1):
        try:
@@ -34,20 +33,12 @@ def main():
            print(f"skipping {cluster_set}. it is not in the df_cases_file.")
            continue

-        if cluster_set in ['German_BERT', 'Patho_BERT', 'top2vec']:
-            text_corpus_path = path2corpus_embedding_preprocessed
-        else:
-            text_corpus_path = path2corpus_bow_preprocessed
-        meta_params = get_metadata(text_corpus_path)
-        with open(text_corpus_path, 'rb') as f:
-            diag_lst = pickle.load(f)
+        report_list = df_cases['preprocessed_text'].tolist()

        # do not apply stopword filtering if it is already stopword filtered!
-        generate_save_topicwords(clusters, diag_lst, save_excel_file_path=excel_file_path,
+        generate_save_topicwords(clusters, report_list, save_excel_file_path=excel_file_path,
                                  n_words=10, print_latex_table=print_latex,
-                                  filter_stop_words=filter_stop_words and not meta_params['stopword_filtered'])
+                                  filter_stop_words=filter_stop_words and not stopword_filtered_corpus)

        if save_umap_picture_in_table:
            pic_path = path2umap_pics + cluster_set + "_UMAP.png"
            try:
@@ -66,14 +57,14 @@ def main():
    ######### topic words of authors #########
-    clusters = label_list_as_int_list(df_cases['label_author'])
+    ''' clusters = label_list_as_int_list(df_cases['label_author'])
    excel_file_path = 'TextClustering/tables/WordsPerCluster_authors.xlsx'
    with open(path2corpus_bow_preprocessed, 'rb') as f:
-        diag_lst = pickle.load(f)
+        report_list = pickle.load(f)

-    generate_save_topicwords(clusters, diag_lst, save_excel_file_path=excel_file_path,
+    generate_save_topicwords(clusters, report_list, save_excel_file_path=excel_file_path,
                              n_words=20, print_latex_table=print_latex,
-                              filter_stop_words=False)
+                              filter_stop_words=False)'''

if __name__ == '__main__':
...
@@ -5,11 +5,13 @@ from database_preparation.utils_labeled_datasets import label_list_as_int_list
clustersets = ["GSDPMM", "KMeans", "LDA", "HDBSCAN",
               "top2vec", "Patho_BERT", "German_BERT"]
+clustersets = ["HDBSCAN"]
plot_titles = ["GSDPMM (UMAP representation)", "k-means (UMAP representation)",
               "LDA (UMAP representation)", "HDBSCAN (UMAP representation)",
               "top2vec (UMAP representation)", "Patho-BERT (UMAP representation)",
               "German-BERT (UMAP representation)"]
-df_cases_file = "database/df_cases.pkl"
+plot_titles = ["HDBSCAN (UMAP representation)"]
+
+df_cases_file = "database/bow_short_diag/bow_short_diag.df.pkl"

def save_umap_plot(clustersetname, df, title=None):
    if not 'label_' + clustersetname in df:
@@ -26,7 +28,7 @@ def save_umap_plot(clustersetname, df, title=None):
    umap_text_features2D = np.transpose(umap_text_features2D)
    cluster_scatter_plot(umap_text_features2D, predictedCluster_text_features,
-                         "TextClustering/plots/UMAP/" + clustersetname + "_UMAP.png",
+                         df_cases_file.replace('.df.pkl', '_') + clustersetname + "_UMAP.png",
                         show_plot=False, colorblindfriendly=False, fig_title=title)

    if 'label_author' in df:
        author_labels = df["label_author"]
...
+from sklearn.feature_extraction.text import CountVectorizer
+import joblib
+import topicwizard
+from sklearn.decomposition import NMF
+from sklearn.pipeline import make_pipeline
+
+'''
+installation:
+pip install topic-wizard
+'''
+
+if __name__ == '__main__':
+    vectorizer = CountVectorizer(min_df=5, max_df=0.8, stop_words="english")
+    model = NMF(n_components=10)
+    topic_pipeline = make_pipeline(vectorizer, model)
+
+    from sklearn.datasets import fetch_20newsgroups
+    newsgroups = fetch_20newsgroups(subset="all")
+    corpus = newsgroups.data
+
+    # Sklearn gives the labels back as integers, we have to map them back to
+    # the actual textual label.
+    group_labels = [newsgroups.target_names[label] for label in newsgroups.target]
+
+    topic_pipeline.fit(corpus)
+    print("launching topicwizard visualizer")
+    topicwizard.visualize(corpus, model=topic_pipeline)
+    exit(0)
\ No newline at end of file
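The new demo script above runs topicwizard on the 20 newsgroups sample. A sketch of pointing the same kind of pipeline at the project's own preprocessed corpus; the .df.pkl path and the preprocessed_text column come from other files in this commit, while joining the token lists back into strings is an assumption:
```
import pandas as pd
import topicwizard
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline

# load the preprocessed reports produced by database_preparation/preprocess.py
corpus_df = pd.read_pickle("database/bow_short_diag/bow_short_diag.df.pkl")
corpus = [" ".join(tokens) for tokens in corpus_df["preprocessed_text"]]

topic_pipeline = make_pipeline(CountVectorizer(min_df=5, max_df=0.8),
                               NMF(n_components=10))
topic_pipeline.fit(corpus)
topicwizard.visualize(corpus, model=topic_pipeline)
```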
import openpyxl
from TextClustering.utils_wordlist import get_top_cluster_words_as_latex_table
-from googletrans import Translator # use pip install googletrans==3.1.0a0, 3.0 version is broken
-from utils_general import custom_translation

path2table = "WordsPerCluster_HDBSCAN.xlsx"
@@ -12,6 +12,13 @@ black = '1'
latex_weak_word = '\\weakcolor'
latex_strong_word = '\\strongcolor'

+translate_to_eng = False
+if translate_to_eng:
+    from googletrans import Translator # use pip install googletrans==3.1.0a0, 3.0 version is broken
+    from utils_general import custom_translation

def color2latex_color(color):
    if color == green:
@@ -49,9 +56,9 @@ def main():
    extraction_methods = ['tf-idf', 'SVM']
    cluster_method = 'HDBSCAN'
    anotate_svm_as_tfidf = True
-    print_also_translated_tables = True

-    translator = Translator()
+    if translate_to_eng:
+        translator = Translator()

    words_list_tfidf = []
    colorstfidf = []
@@ -82,7 +89,7 @@ def main():
            print(latex)

        # print english topic words:
-        if print_also_translated_tables:
+        if translate_to_eng:
            description = f'Annotated topic words (translated from German to English), ' \
                          f'extracted from the {cluster_method} cluster-set, ' \
                          f'using the {extraction_method} based extraction method.'
...
@@ -63,7 +63,7 @@ class ClusterMetrics:
        df = pd.DataFrame(results,
                          index =['s-score', 'ch-index', 'db-score', 'cop', 'dunn-score', 'entropy', 'svm-accuracy'],
                          columns =[tail])
-        df.to_pickle(self.file_name)
+        df.to_csv(self.file_name.replace('.pkl', '') + ".csv")
        print(df)
...
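Since ClusterMetrics now writes a CSV instead of a pickle, the metric table can be inspected without unpickling. A minimal sketch; the file name follows the *_HDBSCAN_metrics pattern produced by the HDBSCAN script earlier in this commit, so treat the exact path as an assumption:
```
import pandas as pd

# one row per metric (s-score, ch-index, db-score, cop, dunn-score,
# entropy, svm-accuracy), one column per evaluated cluster set
metrics = pd.read_csv("database/bow_short_diag/bow_short_diag_HDBSCAN_metrics.csv", index_col=0)
print(metrics.loc[["s-score", "svm-accuracy"]])
```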
@@ -278,6 +278,16 @@ def get_nwordlist(text_lst, cluster_lst, n_words=10,

def save_topwordlist_as_excel(file_path, token_list, sheet_name):
    if bool(file_path):
+        token_list_dict = {}
+        for i_cluster, token_list in enumerate(token_list):
+            for i_top, token in enumerate(token_list):
+                key = f"top {i_top + 1}"
+                if key not in token_list_dict:
+                    token_list_dict[key] = []
+                token_list_dict[key].append(token)
+        pandas.DataFrame(token_list_dict).to_csv(file_path.replace('.xlsx', '.csv'), index_label='cluster')
+        return
        try: # if excelfile does exist, append new sheet to workbook:
            excel_book = pxl.load_workbook(file_path)
            if sheet_name in excel_book.get_sheet_names():
@@ -296,6 +306,8 @@ def save_topwordlist_as_excel(file_path, token_list, sheet_name):
        pandas.DataFrame(token_list).to_excel(file_path, sheet_name=sheet_name)

def generate_save_topicwords(predictedClusters, text_lst, save_excel_file_path,
                             n_words=10, print_latex_table=False,
                             extraction_methods=['TFIDF-based', 'frequency-based', 'yake', 'svm-based'],
...
{"source_data": "./database/diagnosis", "tokenized": true, "cased": false, "stopword_filtered": true, "use_combiner": true, "use_replacer": true, "lemma_mode": 3, "punct_mode": 2, "number_mode": 3}
\ No newline at end of file
{
"source_data": {"path_to_dataset": "database/nephro_reports_sectiondivided.newest", "sections": ["conclusion"]},
"tokenized": true,
"cased": false,
"stopword_filtered": true,
"use_combiner": true,
"use_replacer": true,
"lemma_mode": 3,
"punct_mode": 2,
"number_mode": 3
}
\ No newline at end of file
@@ -12,7 +12,12 @@ from database_preparation.utils_wordbase import RegexpReplacer, RegexpSynonyms
from database_preparation.stop_word_list import filter_stopwords
import json
import argparse
+import pandas as pd
+
+'''
+# installed: nltk, Hanta, tqdm, numpy
+'''

########## define enums ##########
@@ -118,7 +123,8 @@ def preprocess(parameter_dict):
    Histo numbers and dates will always be removed!
    """
-    source_data_path = parameter_dict['source_data']
+    source_data_path = parameter_dict['source_data']['path_to_dataset']
+    sections_to_preprocess = parameter_dict['source_data']['sections']
    do_tokenize = parameter_dict['tokenized']
    cased = parameter_dict['cased']
    stopword_filtered = parameter_dict['stopword_filtered']
@@ -128,7 +134,6 @@ def preprocess(parameter_dict):
    punct_mode = parameter_dict['punct_mode']
    number_mode = parameter_dict['number_mode']

    lemma_mode = LemmatizeMode(lemma_mode)
    punct_mode = PunctuationMode(punct_mode)
    number_mode = NumberMode(number_mode)
@@ -137,17 +142,34 @@ def preprocess(parameter_dict):
    replacer = RegexpReplacer()
    tagger = ht.HanoverTagger('morphmodel_ger.pgz')

-    file_list = glob.glob(source_data_path + '/*.txt')
-    file_list = sorted(file_list, key=lambda f: int(f[f.find("#") + 1:-4]))
+    # load the files in a sorted way:
+    file_list = glob.glob(source_data_path + '/*.json')
+    file_list.sort()

-    preprocessed_corpus = []
+    preprocessed_corpus = {}
+    corpus = {}
    random_example_idx = random.randrange(min(10, len(file_list)))

    for idx, t_file in tqdm(enumerate(file_list)):
-        # %% load the txt-file
-        t_text = read_german_text(t_file)
+        # load the txt-file
+        # t_text = read_german_text(t_file)
+        case_id = t_file.split('/')[-1].replace('.json', '')
+
+        # load the json-file
+        with open(t_file) as json_file:
+            report = json.load(json_file)
+        t_text = ""
+        for section in sections_to_preprocess:
+            if section in report:
+                if report[section]:
+                    t_text += '\n' + report[section]
+        if not t_text:
+            # print(f"Skipping file {t_file} because it does not contain any of the sections: {sections_to_preprocess}")
+            continue

        original_text = t_text
@@ -169,7 +191,7 @@ def preprocess(parameter_dict):
        # lemmatize / stemming
        t_text = tagger.tag_sent(t_text)

-        # %% lemmarize the text
+        # lemmatize the text
        if lemma_mode == LemmatizeMode.lemma_only_nouns:
            t_text = [lemma for (word, lemma, pos) in t_text if pos == "NN" or pos == "NE"]
        elif lemma_mode == LemmatizeMode.lemma_only_nouns_adja:
@@ -186,13 +208,13 @@ def preprocess(parameter_dict):
        else:  # none
            t_text = [word for (word, lemma, pos) in t_text]

-        # %% filter punctuation:
+        # filter punctuation:
        if punct_mode == PunctuationMode.remove:
            t_text = [token for token in t_text if token not in punctuations_to_remove]
        elif punct_mode == PunctuationMode.replace:
            t_text = [token if token not in punctuations_to_remove else punct_replace_symbol for token in t_text]

-        # %% number filtering
+        # number filtering
        filtered_text = []
        use_single_symbol = True
        for i, word in enumerate(t_text):
@@ -280,8 +302,9 @@ def preprocess(parameter_dict):
        if not do_tokenize:
            t_text = ' '.join(t_text)

-        # %% add to the list
-        preprocessed_corpus.append(t_text)
+        # add to the list
+        preprocessed_corpus[case_id] = t_text
+        corpus[case_id] = original_text

        if idx == random_example_idx:
            print("-------------- Preprocessing Example: ---------------")
@@ -291,7 +314,7 @@ def preprocess(parameter_dict):
            print(t_text)
            print("-----------------------------\n")

-    return preprocessed_corpus
+    return preprocessed_corpus, corpus

def main():
@@ -300,9 +323,9 @@ def main():
    sys.path.append(os.getcwd())
    parser = argparse.ArgumentParser()
    parser.add_argument("--path_to_preprocessing_params",
-                        default='database/bow_prepro_diag_meta.json')
+                        default='database/preprocessed_reports/bow_prepro_diag_config.json')
    parser.add_argument("--target_path",
-                        default='database/bow_prepro_diag.pkl')
+                        default=None)
    args = parser.parse_args()

    with open(args.path_to_preprocessing_params) as json_file:
@@ -312,12 +335,30 @@ def main():
    print(prepro_params_2_string(params))
    print()

-    preprocessed_corpus = preprocess(params)
-
-    with open(args.target_path, 'wb') as f:
-        pickle.dump(preprocessed_corpus, f)
-
-    print(f"saved preprocessed corpus at {args.target_path}")
+    preprocessed_corpus_dict, corpus_dict = preprocess(params)
+
+    #with open(args.target_path, 'wb') as f:
+    #    pickle.dump(preprocessed_corpus_dict, f)
+    if args.target_path is None:
+        args.target_path = args.path_to_preprocessing_params.replace('_config', '').replace('config', '')
+    print(f"saved preprocessed corpus at {args.target_path}, containing {len(preprocessed_corpus_dict)} reports.")
+    print(f"Vocabulary size: {len(set([word for c_id in preprocessed_corpus_dict.keys() for word in preprocessed_corpus_dict[c_id]]))}")
+
+    corpus_as_table = {
+        'case_id': [c_id for c_id in preprocessed_corpus_dict.keys()],
+        'text': [corpus_dict[c_id] for c_id in corpus_dict.keys()],
+        'preprocessed_text': [preprocessed_corpus_dict[c_id] for c_id in preprocessed_corpus_dict.keys()]
+    }
+
+    # store results:
+    df = pd.DataFrame(corpus_as_table)
+    df.to_pickle(args.target_path.replace('.json', '.df.pkl'))
+    df.to_csv(args.target_path.replace('.json', '.df.csv'))
+    #with open(args.target_path, "w") as json_file:
+    #    json.dump(preprocessed_corpus_dict, json_file, indent=4)

    '''print(get_corpus_stats("../DataNephroTexts/description"))
    print(get_corpus_stats("../DataNephroTexts/diagnosis"))
...
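After this change preprocess.py emits a DataFrame with case_id, text, and preprocessed_text columns rather than a pickled list. A minimal sketch of inspecting the output; the path is derived from the new default config name via the replace('_config', '') and replace('.json', '.df.pkl') calls above, so treat it as an assumption:
```
import pandas as pd

# default config database/preprocessed_reports/bow_prepro_diag_config.json
# should yield database/preprocessed_reports/bow_prepro_diag.df.pkl
df = pd.read_pickle("database/preprocessed_reports/bow_prepro_diag.df.pkl")
print(df.columns.tolist())                    # ['case_id', 'text', 'preprocessed_text']
print(df.iloc[0]["case_id"])
print(df.iloc[0]["preprocessed_text"][:20])   # first tokens of one tokenized report
```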
@@ -5,8 +5,6 @@ import os
import sys
sys.path.append(os.getcwd())
-import datasets
-import pyarrow as pa
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import KFold
@@ -44,6 +42,9 @@ def text_label_2_labeled_dataset(texts, unfiltered_labels, print_infos=False):
    - converts the passed text-label pair to datasets.Dataset type.
    - returns dataset in format: {"text": labeled_texts, "label": labels}
    '''
+    import datasets
+    import pyarrow as pa
+
    # collect all text-label pairs, skipping invalid labels
    labeled_texts = []
    labels = []
@@ -148,6 +149,9 @@ def generate_save_hf_dataset(label_set="LDA", overwrite=True, lower=False):
    and saves it under "./database/labeled_dataframes/labeld_dataset_" + label_set
    '''
+    import datasets
+    import pyarrow as pa
+
    dataset_path = "./database/labeled_dataframes/labeld_dataset_" + label_set
    if os.path.exists(dataset_path):
...
-numpy==1.21.0
-gensim==4.2.0
-pandas==1.4.2
-matplotlib==3.5.1
-tqdm==4.64.0
-scikit-learn==1.1.1
-hdbscan==0.8.28
-nltk==3.7
-seaborn==0.11.2
-validclust==0.1.1
-tensorflow-gpu==2.6.0
-wordcloud==1.8.2.2
-joblib==1.1.0
-scipy==1.7.3
-yake==0.4.8
-openpyxl==3.0.10
-googletrans==3.1.0a0
-datasets==2.3.2
-transformers==4.21.0.dev0
-dataclasses==0.8
-pyarrow==8.0.0
-keras==2.6.0
-torch==1.11.0
-hanta==0.2.0
\ No newline at end of file
+nltk # for preprocessing
+Hanta
+tqdm
+numpy
+scikit-learn # for clustering
+matplotlib
+pandas
+umap-learn
+hdbscan
+validclust
+pyarrow # for saving and evaluating data
+datasets
+pyldavis # for ui-supported topic modeling analysis
+openpyxl
+yake # for topicword search
\ No newline at end of file