Commit 10d56795 authored by max-laptop

improved pipelines for new topic modeling tasks

parent c7bab612
......@@ -18,13 +18,18 @@ Feel free to use and adapt the scripts to your own needs.
## Requirements
Create a new environment, then install the required python packages with:
```pip install -r requirements.txt```
(so far tested with Python 3.10)
The script ```database_preparation/preprocess.py``` requires some nltk corpora:
```
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
```
\ No newline at end of file
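For a non-interactive setup, the downloads above can also be run as a one-off script. This is only a convenience sketch (the resource names are the ones listed above; the test sentence is arbitrary):
```
# one-off setup sketch: fetch the nltk resources and check that tokenization works
import nltk

for resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

# quick sanity check that the punkt models are usable
print(nltk.word_tokenize("Das ist ein kurzer Testsatz."))
```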
......@@ -15,9 +15,9 @@ from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
import nltk
import pickle
import datasets
import pyarrow as pa
import pickle
fold_amount = 10
......@@ -102,6 +102,7 @@ def cross_validate_label_corpus_with_simple_SVM(labels, path2corpus = "./databas
returns 10-fold-cross-validated accuracy value
"""
texts = pd.read_pickle(path2corpus)
from database_preparation.utils_labeled_datasets import text_label_2_labeled_dataset
......
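For orientation, the idea behind this helper is to treat the cluster labels as classification targets and measure how well a simple SVM can recover them from the texts. Below is a minimal sketch of that idea, assuming TF-IDF features and a default SVC; the repository's function additionally loads the corpus from path2corpus and may differ in details:
```
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

def svm_cv_accuracy(texts, labels, folds=10):
    # texts: list of raw strings, labels: one cluster id per text
    pipeline = Pipeline([("tfidf", TfidfVectorizer()), ("svm", SVC())])
    scores = cross_val_score(pipeline, texts, labels,
                             cv=KFold(n_splits=folds, shuffle=True, random_state=0))
    return float(np.mean(scores))
```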
......@@ -6,7 +6,7 @@ import os
import sys
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
#from sklearn.metrics import ConfusionMatrixDisplay # plot_confusion_matrix # TODO: replace with ConfusionMatrixDisplay
import seaborn as sn
import pandas as pd
import pickle
......
#%% argsparse section
# argsparse section
import sys, os
sys.path.append(os.getcwd())
from TextClustering.argsparse_clustering_preamble import argsparse_preamble
args = argsparse_preamble()
from database_preparation.utils_labeled_datasets import is_text_lst_tokenized, is_text_lst_tfidf_vectorized
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import pandas as pd
......@@ -18,21 +16,28 @@ from nltk import RegexpTokenizer
from TextClustering.utils_metrics import ClusterMetrics
from database_preparation.preprocess import print_meta_data
from TextClassification.classification_for_cluster_evaluation import cross_validate_label_corpus_with_simple_SVM
import json
tokenizer = RegexpTokenizer(r'\w+')
#%% load the data
with open(args.path2corpus, 'rb') as f:
diag_lst = pickle.load(f)
# load the data
'''with open(args.path2corpus, 'rb') as f:
doc_list = pickle.load(f)'''
assert args.path2corpus[-4:] == '.pkl'
corpus_df = pd.read_pickle(args.path2corpus)
corpus_dict = {case_id: corpus_df.loc[corpus_df['case_id'] == case_id, 'preprocessed_text'].values[0] for case_id in corpus_df['case_id']}
doc_list = corpus_df['preprocessed_text'].tolist()
print_meta_data(args.path2corpus)
#print_meta_data(args.path2corpus)
def identity(word):
return word
text_is_vectorized = is_text_lst_tfidf_vectorized(args.path2corpus)
if not (is_text_lst_tokenized(args.path2corpus) or text_is_vectorized):
text_is_vectorized = False #is_text_lst_tfidf_vectorized(args.path2corpus)
if not type(doc_list[0]) == list:
print("Error: " + args.path2corpus + ' has to be a list of tokenized texts or tfidf-vectorized texts!')
exit(1)
......@@ -42,11 +47,13 @@ def create_vectorizer(data):
return vec
if text_is_vectorized:
text_features = diag_lst
text_features = doc_list
else:
text_features = create_vectorizer(diag_lst)
text_features = create_vectorizer(doc_list)
#%% perform umap for dimension-reduction (for cluster-detection)
print(f"tf-idf vectorized corpus generated {text_features.shape[0]} vectors of length {text_features.shape[1]}.")
# perform umap for dimension-reduction (for cluster-detection)
umap_text_features = umap.UMAP(n_neighbors=15,
n_components=5,
metric='cosine').fit_transform(text_features)
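The body of create_vectorizer is not visible in this hunk; for pre-tokenized input, a typical construction passes the already-split tokens straight through, roughly as in the sketch below (an assumption for illustration, not necessarily the repository's exact implementation). The identity function defined above serves as the no-op tokenizer/preprocessor:
```
from sklearn.feature_extraction.text import TfidfVectorizer

def create_vectorizer(data):
    # data is a list of token lists; identity() keeps the tokens as they are,
    # so TfidfVectorizer neither re-tokenizes nor lowercases them.
    vec = TfidfVectorizer(tokenizer=identity, preprocessor=identity,
                          lowercase=False).fit_transform(data)
    return vec
```
The 5-component UMAP embedding above is what HDBSCAN clusters on; the separate 2-component embedding further down is only used for plotting.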
......@@ -57,7 +64,7 @@ umap_text_features2D = umap.UMAP(n_neighbors=15,
min_dist=0.0, metric='cosine').fit_transform(text_features)
if args.find_k_value:
# %% perform hdbscan for cluster-detection with different cluster sizes to find a good solution... by eye
# perform hdbscan for cluster-detection with different cluster sizes to find a good solution... by eye
list_cluster_size = [int(k) for k in np.arange(3, 23, 1)]
s_score, n_cluster, svm_scores = [], [], []
for i_cluster_size in list_cluster_size:
......@@ -67,20 +74,20 @@ if args.find_k_value:
cluster_selection_method='eom').fit(umap_text_features)
result = pd.DataFrame(umap_text_features2D, columns=['x', 'y'])
result['labels'] = cluster.labels_.tolist() # cluster.labels_
df_clustering_result2d = pd.DataFrame(umap_text_features2D, columns=['x', 'y'])
df_clustering_result2d['labels'] = cluster.labels_.tolist() # cluster.labels_
print(np.unique(result.labels))
print(np.unique(df_clustering_result2d.labels))
#%% Visualize clusters
outliers = result.loc[result.labels == -1, :]
clustered = result.loc[result.labels != -1, :]
# Visualize clusters
outliers = df_clustering_result2d.loc[df_clustering_result2d.labels == -1, :]
clustered = df_clustering_result2d.loc[df_clustering_result2d.labels != -1, :]
clustered['labels'] = [str(i) for i in clustered['labels']]
evaluation = ClusterMetrics(umap_text_features, cluster.labels_.tolist())
s_score.append(evaluation.s_score)
svm_scores.append(
cross_validate_label_corpus_with_simple_SVM(cluster.labels_.tolist(), args.path2corpus + '.pkl',
cross_validate_label_corpus_with_simple_SVM(cluster.labels_.tolist(), args.path2corpus,
False))
n_cluster.append(len(np.unique(cluster.labels_.tolist())))
......@@ -103,30 +110,36 @@ if args.find_k_value:
exit()
#%% perform hdbscan with best cluster size
# perform hdbscan with best cluster size
cluster = hdbscan.HDBSCAN(min_cluster_size=args.k_value,
metric='euclidean',
cluster_selection_method='eom').fit(umap_text_features)
result = pd.DataFrame(umap_text_features2D, columns=['x', 'y'])
result['labels'] = cluster.labels_.tolist() # cluster.labels_
clusters = np.int8([str(i) for i in result['labels']])
outliers = result.loc[result.labels == -1, :]
clusters_no_outliers = result.loc[result.labels != -1, :]
df_clustering_result2d = pd.DataFrame(umap_text_features2D, columns=['x', 'y'])
df_clustering_result2d['labels'] = cluster.labels_.tolist() # cluster.labels_
clusters = np.int8([str(i) for i in df_clustering_result2d['labels']])
outliers = df_clustering_result2d.loc[df_clustering_result2d.labels == -1, :]
clusters_no_outliers = df_clustering_result2d.loc[df_clustering_result2d.labels != -1, :]
unique_clusters = np.unique(result.labels)
print(f"\nfound {len(unique_clusters[unique_clusters>-1])} clusters.\n")
unique_clusters = np.unique(df_clustering_result2d.labels)
# save umaped vectors:
df = pd.read_pickle(args.df_cases_file)
df['umapX_HDBSCAN'] = umap_text_features2D[:, 0]
df['umapY_HDBSCAN'] = umap_text_features2D[:, 1]
df['label_HDBSCAN'] = clusters
df.to_pickle(args.df_cases_file)
#%% and evaluate the results with several metrics (not needing ground truth)
evaluation = ClusterMetrics(umap_text_features[result.labels >= 0,], clusters_no_outliers.labels.tolist(),
file_name= "TextClustering/cluster_metrics/HDBSCAN_metrics.pkl")
#df = pd.read_pickle(args.df_cases_file)
corpus_df['umapX_HDBSCAN'] = umap_text_features2D[:, 0]
corpus_df['umapY_HDBSCAN'] = umap_text_features2D[:, 1]
corpus_df['label_HDBSCAN'] = clusters
corpus_df.to_pickle(args.path2corpus)
corpus_df.to_csv(args.path2corpus.replace('.pkl', '.csv'))
#corpus_df.to_excel(args.path2corpus.replace('.pkl', '.xlsx'))
print(f"updated {args.path2corpus} with umapX_HDBSCAN, umapY_HDBSCAN and label_HDBSCAN.")
print(corpus_df.head())
print(f"cluster count: {len(unique_clusters)}")
print(f"outliers: {len(outliers)}")
# and evaluate the results with several metrics (not needing ground truth)
evaluation = ClusterMetrics(umap_text_features[df_clustering_result2d.labels >= 0,], clusters_no_outliers.labels.tolist(),
file_name=args.path2corpus.replace('.pkl', '').replace('.json', '').replace('.df', '') + '_HDBSCAN_metrics')
evaluation.write_to_file()
#%% argsparse section
# argsparse section
import sys, os
sys.path.append(os.getcwd())
......@@ -13,7 +13,7 @@ if not is_text_lst_tokenized(args.path2corpus):
'Please pass texts list where each text is tokenized (a list of words).')
exit(1)
#%% import section
# import section
import pickle
import gensim
import gensim.corpora as corpora
......@@ -26,20 +26,20 @@ from tqdm import tqdm
from database_preparation.preprocess import print_meta_data
from TextClassification.classification_for_cluster_evaluation import cross_validate_label_corpus_with_simple_SVM
#%% load the diag and main_diag list
# load the diag and main_diag list
with open(args.path2corpus, 'rb') as f:
diag_lst = pickle.load(f)
print_meta_data(args.path2corpus)
#%% prepare database_preparation for LDA-model-training
# prepare database_preparation for LDA-model-training
# Creates a Dictionary, which is a mapping of word IDs to words.
words = corpora.Dictionary(diag_lst)
# Turns each document into a bag of words.
corpus = [words.doc2bow(doc) for doc in diag_lst]  # each doc becomes a bag of words: a list of (word_id, count) pairs
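To make the bag-of-words step concrete, here is a tiny, self-contained illustration with made-up tokens (the real input in this script is diag_lst):
```
import gensim.corpora as corpora

toy_docs = [["niere", "biopsie", "glomerulonephritis"],
            ["niere", "tumor"]]
toy_dict = corpora.Dictionary(toy_docs)               # token <-> integer id mapping
toy_bow = [toy_dict.doc2bow(doc) for doc in toy_docs]
print(toy_dict.token2id)  # e.g. {'biopsie': 0, 'glomerulonephritis': 1, 'niere': 2, 'tumor': 3}
print(toy_bow)            # per document: list of (word_id, count) pairs
```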
#%% train LDA-model with different number of clusters
# train LDA-model with different number of clusters
if args.find_k_value:
limit=21; start=5; step=1
coherence_values = []
......@@ -72,7 +72,7 @@ if args.find_k_value:
n_cluster.append(len(np.unique(np.asarray(predictedCluster))))
print("coherence: " + str(coherencemodel.get_coherence()))
#%% visualize the results
# visualize the results
x = range(start, limit, step)
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
......@@ -92,7 +92,7 @@ if args.find_k_value:
plt.show()
exit()
#%% train LDA-model
# train LDA-model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
id2word=words,
num_topics=args.k_value,
......@@ -103,7 +103,7 @@ lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
per_word_topics=True)
#%% get topic weights / features
# get topic weights / features
topic_weights = []
for i, row_list in enumerate(lda_model[corpus]):
topic_weights.append([w for i, w in row_list[0]])
......@@ -111,7 +111,7 @@ for i, row_list in enumerate(lda_model[corpus]):
# Array of topic weights
text_features = pd.DataFrame(topic_weights).fillna(0).values
#%% get prediction
# get prediction
predictedCluster= np.argmax(text_features, axis=1)
# and add it to the dataframe
......@@ -126,7 +126,7 @@ df['pcaX_LDA'] = reduced_features[:, 0]
df['pcaY_LDA'] = reduced_features[:, 1]
#%% and with umap
# and with umap
import umap
umap_text_features2D = umap.UMAP(n_neighbors=15,
n_components=2,
......@@ -136,7 +136,7 @@ df['umapX_LDA'] = umap_text_features2D[:, 0]
df['umapY_LDA'] = umap_text_features2D[:, 1]
df.to_pickle(args.df_cases_file)
#%% evaluate the model
# evaluate the model
from TextClustering.utils_metrics import ClusterMetrics
evaluation = ClusterMetrics(text_features, predictedCluster,
file_name= "TextClustering/cluster_metrics/LDA_metrics.pkl")
......
......@@ -6,26 +6,25 @@ import pickle
import openpyxl
# parameters:
df_cases_file = "database/df_cases.pkl"
df_cases_file = "database/bow_short_diag/bow_short_diag.df.pkl"
print_latex = False
filter_stop_words = True
path2umap_pics = 'TextClustering/plots/UMAP/'
save_umap_picture_in_table = True
path2corpus_bow_preprocessed = 'database/bow_prepro_diag.pkl'
path2corpus_embedding_preprocessed = 'database/embedding_prepro_diag.pkl'
stopword_filtered_corpus = True
####### functions ##########
def main():
cluster_sets = ['KMeans', 'LDA', 'HDBSCAN', 'German_BERT', 'Patho_BERT', 'top2vec', 'GSDPMM']
cluster_sets = ['HDBSCAN'] #['HDBSCAN', 'German_BERT', 'Patho_BERT', 'top2vec', 'GSDPMM']
# cluster_sets = ['German_BERT']
df_cases = pd.read_pickle(df_cases_file)
for cluster_set in cluster_sets:
# re-generate the topic words:
excel_file_path = 'TextClustering/tables/WordsPerCluster_' + cluster_set + '.xlsx'
excel_file_path = df_cases_file.replace('.df.pkl', f'_{cluster_set}_topwords.xlsx') #'TextClustering/tables/WordsPerCluster_' + cluster_set + '.xlsx'
# convert nan-values in int(-1):
try:
......@@ -34,20 +33,12 @@ def main():
print(f"skipping {cluster_set}. it is not in the df_cases_file.")
continue
if cluster_set in ['German_BERT', 'Patho_BERT', 'top2vec']:
text_corpus_path = path2corpus_embedding_preprocessed
else:
text_corpus_path = path2corpus_bow_preprocessed
meta_params = get_metadata(text_corpus_path)
with open(text_corpus_path, 'rb') as f:
diag_lst = pickle.load(f)
report_list = df_cases['preprocessed_text'].tolist()
# do not apply stopword filtering if it is already stopword filtered!
generate_save_topicwords(clusters, diag_lst, save_excel_file_path=excel_file_path,
generate_save_topicwords(clusters, report_list, save_excel_file_path=excel_file_path,
n_words=10, print_latex_table=print_latex,
filter_stop_words=filter_stop_words and not meta_params['stopword_filtered'])
filter_stop_words=filter_stop_words and not stopword_filtered_corpus)
if save_umap_picture_in_table:
pic_path = path2umap_pics + cluster_set + "_UMAP.png"
try:
......@@ -66,14 +57,14 @@ def main():
######### topic words of authors #########
clusters = label_list_as_int_list(df_cases['label_author'])
''' clusters = label_list_as_int_list(df_cases['label_author'])
excel_file_path = 'TextClustering/tables/WordsPerCluster_authors.xlsx'
with open(path2corpus_bow_preprocessed, 'rb') as f:
diag_lst = pickle.load(f)
report_list = pickle.load(f)
generate_save_topicwords(clusters, diag_lst, save_excel_file_path=excel_file_path,
generate_save_topicwords(clusters, report_list, save_excel_file_path=excel_file_path,
n_words=20, print_latex_table=print_latex,
filter_stop_words=False)
filter_stop_words=False)'''
if __name__ == '__main__':
......
......@@ -5,11 +5,13 @@ from database_preparation.utils_labeled_datasets import label_list_as_int_list
clustersets = ["GSDPMM", "KMeans", "LDA", "HDBSCAN",
"top2vec", "Patho_BERT", "German_BERT"]
clustersets = ["HDBSCAN"]
plot_titles = ["GSDPMM (UMAP representation)", "k-means (UMAP representation)",
"LDA (UMAP representation)", "HDBSCAN (UMAP representation)",
"top2vec (UMAP representation)", "Patho-BERT (UMAP representation)",
"German-BERT (UMAP representation)"]
df_cases_file = "database/df_cases.pkl"
plot_titles = ["HDBSCAN (UMAP representation)"]
df_cases_file = "database/bow_short_diag/bow_short_diag.df.pkl"
def save_umap_plot(clustersetname, df, title=None):
if not 'label_' + clustersetname in df:
......@@ -26,7 +28,7 @@ def save_umap_plot(clustersetname, df, title=None):
umap_text_features2D = np.transpose(umap_text_features2D)
cluster_scatter_plot(umap_text_features2D, predictedCluster_text_features,
"TextClustering/plots/UMAP/" + clustersetname + "_UMAP.png",
df_cases_file.replace('.df.pkl', '_') + clustersetname + "_UMAP.png",
show_plot=False, colorblindfriendly=False, fig_title=title)
if 'label_author' in df:
author_labels = df["label_author"]
......
from sklearn.feature_extraction.text import CountVectorizer
import joblib
import topicwizard
from sklearn.decomposition import NMF
from sklearn.pipeline import make_pipeline
'''
installation:
pip install topic-wizard
'''
if __name__ == '__main__':
vectorizer = CountVectorizer(min_df=5, max_df=0.8, stop_words="english")
model = NMF(n_components=10)
topic_pipeline = make_pipeline(vectorizer, model)
from sklearn.datasets import fetch_20newsgroups
newsgroups = fetch_20newsgroups(subset="all")
corpus = newsgroups.data
# Sklearn gives the labels back as integers, so we have to map them back to
# the actual textual labels.
group_labels = [newsgroups.target_names[label] for label in newsgroups.target]
topic_pipeline.fit(corpus)
print("launching topicwizard visualizer")
topicwizard.visualize(corpus, model=topic_pipeline)
exit(0)
\ No newline at end of file
import openpyxl
from TextClustering.utils_wordlist import get_top_cluster_words_as_latex_table
from googletrans import Translator # use pip install googletrans==3.1.0a0, 3.0 version is broken
from utils_general import custom_translation
path2table = "WordsPerCluster_HDBSCAN.xlsx"
......@@ -12,6 +12,13 @@ black = '1'
latex_weak_word = '\\weakcolor'
latex_strong_word = '\\strongcolor'
translate_to_eng = False
if translate_to_eng:
from googletrans import Translator # use pip install googletrans==3.1.0a0, 3.0 version is broken
from utils_general import custom_translation
def color2latex_color(color):
if color == green:
......@@ -49,9 +56,9 @@ def main():
extraction_methods = ['tf-idf', 'SVM']
cluster_method = 'HDBSCAN'
anotate_svm_as_tfidf = True
print_also_translated_tables = True
translator = Translator()
if translate_to_eng:
translator = Translator()
words_list_tfidf = []
colorstfidf = []
......@@ -82,7 +89,7 @@ def main():
print(latex)
# print english topic words:
if print_also_translated_tables:
if translate_to_eng:
description = f'Annotated topic words (translated from German to English), ' \
f'extracted from the {cluster_method} cluster-set, ' \
f'using the {extraction_method} based extraction method.'
......
......@@ -63,7 +63,7 @@ class ClusterMetrics:
df = pd.DataFrame(results,
index =['s-score', 'ch-index', 'db-score', 'cop', 'dunn-score', 'entropy', 'svm-accuracy'],
columns =[tail])
df.to_pickle(self.file_name)
df.to_csv(self.file_name.replace('.pkl', '') + ".csv")
print(df)
......
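For reference, the first three of these indices (silhouette, Calinski-Harabasz, Davies-Bouldin) are available directly in scikit-learn. A minimal sketch of assembling such a metrics row, assuming a feature matrix and a label list (cop, dunn, entropy and the SVM accuracy used by this class are not covered here):
```
import pandas as pd
from sklearn.metrics import (silhouette_score,
                             calinski_harabasz_score,
                             davies_bouldin_score)

def basic_cluster_metrics(features, labels, column_name="HDBSCAN"):
    # features: array of clustered vectors, labels: one cluster id per vector
    results = [silhouette_score(features, labels),
               calinski_harabasz_score(features, labels),
               davies_bouldin_score(features, labels)]
    return pd.DataFrame(results,
                        index=["s-score", "ch-index", "db-score"],
                        columns=[column_name])
```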
......@@ -278,6 +278,16 @@ def get_nwordlist(text_lst, cluster_lst, n_words=10,
def save_topwordlist_as_excel(file_path, token_list, sheet_name):
if bool(file_path):
token_list_dict = {}
for i_cluster, cluster_tokens in enumerate(token_list):
for i_top, token in enumerate(cluster_tokens):
key = f"top {i_top + 1}"
if key not in token_list_dict:
token_list_dict[key] = []
token_list_dict[key].append(token)
pandas.DataFrame(token_list_dict).to_csv(file_path.replace('.xlsx', '.csv'), index_label='cluster')
return
try: # if excelfile does exist, append new sheet to workbook:
excel_book = pxl.load_workbook(file_path)
if sheet_name in excel_book.get_sheet_names():
......@@ -296,6 +306,8 @@ def save_topwordlist_as_excel(file_path, token_list, sheet_name):
pandas.DataFrame(token_list).to_excel(file_path, sheet_name=sheet_name)
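A small worked example of the CSV branch above, with hypothetical top-word lists just to show the resulting layout (one row per cluster, one column per rank):
```
import pandas

# hypothetical top-word lists, one inner list per cluster
top_words = [["niere", "glomerulum", "tubulus"],
             ["tumor", "karzinom", "zelle"]]
token_list_dict = {}
for i_cluster, cluster_tokens in enumerate(top_words):
    for i_top, token in enumerate(cluster_tokens):
        token_list_dict.setdefault(f"top {i_top + 1}", []).append(token)

# index column is labelled "cluster", as in the code above
print(pandas.DataFrame(token_list_dict).to_csv(index_label='cluster'))
# cluster,top 1,top 2,top 3
# 0,niere,glomerulum,tubulus
# 1,tumor,karzinom,zelle
```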
def generate_save_topicwords(predictedClusters, text_lst, save_excel_file_path,
n_words=10, print_latex_table=False,
extraction_methods=['TFIDF-based', 'frequency-based', 'yake', 'svm-based'],
......
{"source_data": "./database/diagnosis", "tokenized": true, "cased": false, "stopword_filtered": true, "use_combiner": true, "use_replacer": true, "lemma_mode": 3, "punct_mode": 2, "number_mode": 3}
\ No newline at end of file
{
"source_data": {"path_to_dataset": "database/nephro_reports_sectiondivided.newest", "sections": ["conclusion"]},
"tokenized": true,
"cased": false,
"stopword_filtered": true,
"use_combiner": true,
"use_replacer": true,
"lemma_mode": 3,
"punct_mode": 2,
"number_mode": 3
}
\ No newline at end of file
......@@ -12,7 +12,12 @@ from database_preparation.utils_wordbase import RegexpReplacer, RegexpSynonyms
from database_preparation.stop_word_list import filter_stopwords
import json
import argparse
import pandas as pd
'''
# installed: nltk, Hanta, tqdm, numpy
'''
########## define enums ##########
......@@ -118,7 +123,8 @@ def preprocess(parameter_dict):
Histo numbers and dates will always be removed!
"""
source_data_path = parameter_dict['source_data']
source_data_path = parameter_dict['source_data']['path_to_dataset']
sections_to_preprocess = parameter_dict['source_data']['sections']
do_tokenize = parameter_dict['tokenized']
cased = parameter_dict['cased']
stopword_filtered = parameter_dict['stopword_filtered']
......@@ -128,7 +134,6 @@ def preprocess(parameter_dict):
punct_mode = parameter_dict['punct_mode']
number_mode = parameter_dict['number_mode']
lemma_mode = LemmatizeMode(lemma_mode)
punct_mode = PunctuationMode(punct_mode)
number_mode = NumberMode(number_mode)
......@@ -137,17 +142,34 @@ def preprocess(parameter_dict):
replacer = RegexpReplacer()
tagger = ht.HanoverTagger('morphmodel_ger.pgz')
file_list = glob.glob(source_data_path + '/*.txt')
file_list = sorted(file_list, key=lambda f: int(f[f.find("#") + 1:-4]))
# load the files in a sorted way:
file_list = glob.glob(source_data_path + '/*.json')
file_list.sort()
preprocessed_corpus = []
preprocessed_corpus = {}
corpus = {}
random_example_idx = random.randrange(min(10, len(file_list)))
for idx, t_file in tqdm(enumerate(file_list)):
# %% load the txt-file
t_text = read_german_text(t_file)
# load the txt-file
# t_text = read_german_text(t_file)
case_id = t_file.split('/')[-1].replace('.json', '')
# load the json-file
with open(t_file) as json_file:
report = json.load(json_file)
t_text = ""
for section in sections_to_preprocess:
if section in report:
if report[section]:
t_text += '\n' + report[section]
if not t_text:
# print(f"Skipping file {t_file} because it does not contain any of the sections: {sections_to_preprocess}")
continue
original_text = t_text
......@@ -169,7 +191,7 @@ def preprocess(parameter_dict):
# lemmatize / stemming
t_text = tagger.tag_sent(t_text)
# %% lemmatize the text
# lemmatize the text
if lemma_mode == LemmatizeMode.lemma_only_nouns:
t_text = [lemma for (word, lemma, pos) in t_text if pos == "NN" or pos == "NE"]
elif lemma_mode == LemmatizeMode.lemma_only_nouns_adja:
......@@ -186,13 +208,13 @@ def preprocess(parameter_dict):
else: # none
t_text = [word for (word, lemma, pos) in t_text]
# %% filter punctuation:
# filter punctuation:
if punct_mode == PunctuationMode.remove:
t_text = [token for token in t_text if token not in punctuations_to_remove]
elif punct_mode == PunctuationMode.replace:
t_text = [token if token not in punctuations_to_remove else punct_replace_symbol for token in t_text]
# %% number filtering
# number filtering
filtered_text = []
use_single_symbol = True
for i, word in enumerate(t_text):
......@@ -280,8 +302,9 @@ def preprocess(parameter_dict):
if not do_tokenize:
t_text = ' '.join(t_text)
# %% add to the list
preprocessed_corpus.append(t_text)
# add to the list
preprocessed_corpus[case_id] = t_text
corpus[case_id] = original_text
if idx == random_example_idx:
print("-------------- Preprocessing Example: ---------------")
......@@ -291,7 +314,7 @@ def preprocess(parameter_dict):
print(t_text)
print("-----------------------------\n")
return preprocessed_corpus
return preprocessed_corpus, corpus
def main():
......@@ -300,9 +323,9 @@ def main():
sys.path.append(os.getcwd())
parser = argparse.ArgumentParser()
parser.add_argument("--path_to_preprocessing_params",
default='database/bow_prepro_diag_meta.json')
default='database/preprocessed_reports/bow_prepro_diag_config.json')
parser.add_argument("--target_path",
default='database/bow_prepro_diag.pkl')
default=None)
args = parser.parse_args()
with open(args.path_to_preprocessing_params) as json_file:
......@@ -312,12 +335,30 @@ def main():
print(prepro_params_2_string(params))
print()
preprocessed_corpus = preprocess(params)
preprocessed_corpus_dict, corpus_dict = preprocess(params)
#with open(args.target_path, 'wb') as f:
# pickle.dump(preprocessed_corpus_dict, f)
if args.target_path is None:
args.target_path = args.path_to_preprocessing_params.replace('_config', '').replace('config', '')
print(f"saved preprocessed corpus at {args.target_path}, containing {len(preprocessed_corpus_dict)} reports.")
print(f"Voctabulary size: {len(set([word for c_id in preprocessed_corpus_dict.keys() for word in preprocessed_corpus_dict[c_id]]))}")
with open(args.target_path, 'wb') as f:
pickle.dump(preprocessed_corpus, f)
corpus_as_table = {
'case_id': [c_id for c_id in preprocessed_corpus_dict.keys()],
'text': [corpus_dict[c_id] for c_id in corpus_dict.keys()],
'preprocessed_text': [preprocessed_corpus_dict[c_id] for c_id in preprocessed_corpus_dict.keys()]
}
print(f"saved preprocessed corpus at {args.target_path}")
# store results:
df = pd.DataFrame(corpus_as_table)
df.to_pickle(args.target_path.replace('.json', '.df.pkl'))
df.to_csv(args.target_path.replace('.json', '.df.csv'))
#with open(args.target_path, "w") as json_file:
# json.dump(preprocessed_corpus_dict, json_file, indent=4)
'''print(get_corpus_stats("../DataNephroTexts/description"))
print(get_corpus_stats("../DataNephroTexts/diagnosis"))
......
......@@ -5,8 +5,6 @@ import os
import sys
sys.path.append(os.getcwd())
import datasets
import pyarrow as pa
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import KFold
......@@ -44,6 +42,9 @@ def text_label_2_labeled_dataset(texts, unfiltered_labels, print_infos=False):
- converts the passed text-label pair to datasets.Dataset type.
- returns dataset in format: {"text": labeled_texts, "label": labels}
'''
import datasets
import pyarrow as pa
# collect all text-label pairs, skipping invalid labels
labeled_texts = []
labels = []
......@@ -148,6 +149,9 @@ def generate_save_hf_dataset(label_set="LDA", overwrite=True, lower=False):
and saves it under "./database/labeled_dataframes/labeld_dataset_" + label_set
'''
import datasets
import pyarrow as pa
dataset_path = "./database/labeled_dataframes/labeld_dataset_" + label_set
if os.path.exists(dataset_path):
......
numpy==1.21.0
gensim==4.2.0
pandas==1.4.2
matplotlib==3.5.1
tqdm==4.64.0
scikit-learn==1.1.1
hdbscan==0.8.28
nltk==3.7
seaborn==0.11.2
validclust==0.1.1
tensorflow-gpu==2.6.0
wordcloud==1.8.2.2
joblib==1.1.0
scipy==1.7.3
yake==0.4.8
openpyxl==3.0.10
googletrans==3.1.0a0
datasets==2.3.2
transformers==4.21.0.dev0
dataclasses==0.8
pyarrow==8.0.0
keras==2.6.0
torch==1.11.0
hanta==0.2.0
\ No newline at end of file
nltk # for preprocessing
Hanta
tqdm
numpy
scikit-learn # for clustering
matplotlib
pandas
umap-learn
hdbscan
validclust
pyarrow # for saving and evaluating data
datasets
pyldavis # for ui-supported topic modeling analysis
openpyxl
yake # for topicword search
\ No newline at end of file