Maximilian Legnar / NLP in diagnostic texts from nephropathology · Commits

Commit e47051e4, authored Dec 19, 2024 by max-laptop
renamed folder "database" to "data"
parent 1245ae68

Showing 27 changed files with 69 additions and 57 deletions (+69 / -57)
TextClassification/argsparse_classification_preamble.py (+1 / -1)
TextClassification/bow_classification.py (+2 / -2)
TextClassification/classification_for_cluster_evaluation.py (+11 / -11)
TextClassification/classification_pipeline.py (+2 / -2)
TextClustering/argsparse_clustering_preamble.py (+2 / -2)
TextClustering/basedOn_Embedding/BERT_Diagnosis.py (+1 / -1)
TextClustering/cluster_scores2latextable.py (+3 / -3)
TextClustering/clustering_pipeline.py (+2 / -2)
TextClustering/clusterset_histos.py (+1 / -1)
TextClustering/generate_topicwords.py (+1 / -1)
TextClustering/plot_clustersets.py (+2 / -2)
TextClustering/utils_wordlist.py (+1 / -1)
data/bow_prepro_desc_meta.json (+0 / -0)
data/bow_short_diag/bow_short_diag_config.json (+0 / -0)
data/embedding_prepro_desc_meta.json (+0 / -0)
data/embedding_prepro_diag_meta.json (+0 / -0)
data/preprocessed_reports/bow_prepro_diag_config.json (+0 / -0)
database_preparation/count_oov_cases.py (+2 / -2)
database_preparation/data_preparation_pipeline.py (+4 / -4)
database_preparation/label_reports_with_authors.py (+1 / -1)
database_preparation/preprocess.py (+1 / -1)
database_preparation/save_vectorized_texts.py (+10 / -10)
database_preparation/split_reports.py (+1 / -1)
database_preparation/utils_labeled_datasets.py (+9 / -9)
topic_modeling/README.md (+11 / -0)
topic_modeling/requirements.txt (+1 / -0)
topic_modeling/topic-modeling-analysis.py (+0 / -0)
TextClassification/argsparse_classification_preamble.py
@@ -9,7 +9,7 @@ def argsparse_preamble():
     parser.add_argument("--overwrite", action='store_true')  # False: generate data only if it doesn't already exist
     parser.add_argument("--show_figures", action='store_true')
     parser.add_argument("--clustered_data", default="HDBSCAN")
-    parser.add_argument("--path2corpus", default="database/bow_prepro_desc.pkl")
+    parser.add_argument("--path2corpus", default="data/bow_prepro_desc.pkl")
     parser.add_argument("--loss_curve_check", action='store_true')
     args = parser.parse_args()
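The hunk above only swaps the default corpus path from database/ to data/. For context, a minimal sketch of how a downstream script would consume this preamble after the rename; the surrounding calls are assumptions, not part of this commit:

```python
# Hypothetical usage sketch of argsparse_preamble() after the folder rename.
from TextClassification.argsparse_classification_preamble import argsparse_preamble
import pickle

args = argsparse_preamble()

# The corpus now lives under data/ instead of database/.
with open(args.path2corpus, "rb") as f:  # e.g. "data/bow_prepro_desc.pkl"
    corpus = pickle.load(f)

if args.overwrite:
    print("regenerating data even if it already exists")
```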
TextClassification/bow_classification.py
@@ -57,8 +57,8 @@ def get_immediate_subdirectories(a_dir):
 def cross_validate_with_bow_classifiers(label_set, fold_amount=10,
-                                        path2corpus="./database/bow_prepro_desc.pkl",
-                                        df_cases_file="database/df_cases.pkl"):
+                                        path2corpus="./data/bow_prepro_desc.pkl",
+                                        df_cases_file="data/df_cases.pkl"):
     '''
     cross-validates the passed label_set with the text data saved in path2corpus and the labels saved in df_cases_file.
TextClassification/classification_for_cluster_evaluation.py
@@ -47,7 +47,7 @@ def create_pipeline(estimator, reduction=False):
         steps.append(('classifier', estimator))
     return Pipeline(steps)

-def cross_validate_with_simple_SVM(label_set, path2corpus="./database/bow_prepro_diag.pkl",
-                                   path2dfcases='./database/df_cases.pkl'):
+def cross_validate_with_simple_SVM(label_set, path2corpus="./data/bow_prepro_diag.pkl",
+                                   path2dfcases='./data/df_cases.pkl'):
     """
     trains a simple SVM with the given data
     returns 10-fold cross-validated accuracy value
@@ -96,7 +96,7 @@ def cross_validate_with_simple_SVM(label_set, path2corpus = "./database/bow_prep
     return metrics

-def cross_validate_label_corpus_with_simple_SVM(labels, path2corpus="./database/bow_prepro_diag.pkl", sample=True):
+def cross_validate_label_corpus_with_simple_SVM(labels, path2corpus="./data/bow_prepro_diag.pkl", sample=True):
     """
     trains a simple SVM with the given data
     returns 10-fold cross-validated accuracy value
@@ -145,7 +145,7 @@ def cross_validate_label_corpus_with_simple_SVM(labels, path2corpus = "./databas
     return np.mean(metrics.scores['accuracy'])

-def train_SVM_with_clusterset(label_set, path2corpus="./database/bow_prepro_diag.pkl",
-                              path2dfcases='./database/df_cases.pkl'):
+def train_SVM_with_clusterset(label_set, path2corpus="./data/bow_prepro_diag.pkl",
+                              path2dfcases='./data/df_cases.pkl'):
     """
     trains and saves an SVM, trained with the whole data, as:
     "./ModelTestingAndExplaining/models/SVM_trained_with_" + label_set + "_clustered.pkl"
@@ -207,15 +207,15 @@ def update_cls_metric(label_set, cls_accuracy):
 def update_cls_metric_for_each_clusterset():
     '''
-    does 10-fold cross-validation with an SVM for each cluster set saved in './database/df_cases.pkl',
-    always using the text in 'database/diag_lst_tokenized.pkl'
+    does 10-fold cross-validation with an SVM for each cluster set saved in './data/df_cases.pkl',
+    always using the text in 'data/diag_lst_tokenized.pkl'
     '''
     label_sets = dt.get_all_label_set_ids()
     # label_sets = ["German_BERT"]
     for label_set in label_sets:
-        accuracy = np.mean(cross_validate_with_simple_SVM(label_set, 'database/diag_lst_tokenized.pkl',
-                                                          './database/df_cases.pkl').scores['accuracy'])
+        accuracy = np.mean(cross_validate_with_simple_SVM(label_set, 'data/diag_lst_tokenized.pkl',
+                                                          './data/df_cases.pkl').scores['accuracy'])
         print("svm-cls-accuracy of cluster set " + label_set + ": " + str(accuracy))
         update_cls_metric(label_set, accuracy)
@@ -223,10 +223,10 @@ def update_cls_metric_for_each_clusterset():
 def main():
     #update_cls_metric_for_each_clusterset()
     cluster_set_name = "German_BERT"
-    #text_data = 'database/darmischaemie_prostata_txt_lst.pkl'  cluster_set_dict = './database/df_cases2.pkl'
-    text_data = 'database/diag_lst.pkl'
-    #text_data = 'database/diag_lst_tokenized.pkl'
-    cluster_set_dict = './database/df_cases.pkl'
+    #text_data = 'data/darmischaemie_prostata_txt_lst.pkl'  cluster_set_dict = './data/df_cases2.pkl'
+    text_data = 'data/diag_lst.pkl'
+    #text_data = 'data/diag_lst_tokenized.pkl'
+    cluster_set_dict = './data/df_cases.pkl'
     train_SVM_with_clusterset(cluster_set_name, text_data, cluster_set_dict)
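All of the changed call sites in this file are default-path swaps. A short sketch of how these helpers fit together under the new data/ layout; the argument values are illustrative assumptions:

```python
# Hypothetical driver for the SVM-based cluster evaluation after the rename.
import numpy as np

# 10-fold cross-validation of one cluster set, using the renamed default paths.
metrics = cross_validate_with_simple_SVM("German_BERT",
                                         path2corpus="./data/bow_prepro_diag.pkl",
                                         path2dfcases="./data/df_cases.pkl")
print(np.mean(metrics.scores['accuracy']))

# Fit on the full corpus and persist the model under ModelTestingAndExplaining/models/.
train_SVM_with_clusterset("German_BERT",
                          path2corpus="./data/bow_prepro_diag.pkl",
                          path2dfcases="./data/df_cases.pkl")
```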
TextClassification/classification_pipeline.py
@@ -6,8 +6,8 @@ import os
 cluster_sets = ['HDBSCAN']

 # params:
-path2corpus_bow_preprocessed = 'database/bow_prepro_desc.pkl'
-path2corpus_embedding_preprocessed = 'database/embedding_prepro_desc.pkl'
+path2corpus_bow_preprocessed = 'data/bow_prepro_desc.pkl'
+path2corpus_embedding_preprocessed = 'data/embedding_prepro_desc.pkl'

 #check working directory:
TextClustering/argsparse_clustering_preamble.py
@@ -11,8 +11,8 @@ def argsparse_preamble():
     parser.add_argument("--show_figures", action='store_true')
     parser.add_argument("--model2use", default="German_BERT")
     parser.add_argument('--do_embedding', action='store_true')
-    parser.add_argument("--path2corpus", default='database/bow_prepro_diag.pkl')
-    parser.add_argument("--df_cases_file", default='database/df_cases.pkl')
+    parser.add_argument("--path2corpus", default='data/bow_prepro_diag.pkl')
+    parser.add_argument("--df_cases_file", default='data/df_cases.pkl')
     args = parser.parse_args()
TextClustering/basedOn_Embedding/BERT_Diagnosis.py
@@ -26,7 +26,7 @@ from sentence_transformers import SentenceTransformer
 from database_preparation.preprocess import print_meta_data

-embedding_backup_folder = "database/backup_files/"
+embedding_backup_folder = "data/backup_files/"
 if not os.path.isdir(embedding_backup_folder):
     os.makedirs(embedding_backup_folder)
 path_2_pathoBERT = "./LanguageModelling/ger-patho-bert-2"
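Only the backup folder path changes here. For context, a brief sketch of the embed-and-back-up pattern this script appears to use; the encode-and-save step is an assumption based on the imports and paths shown above:

```python
# Hypothetical sketch: embed diagnosis texts and back them up under data/backup_files/.
import os
import pickle
from sentence_transformers import SentenceTransformer

embedding_backup_folder = "data/backup_files/"
os.makedirs(embedding_backup_folder, exist_ok=True)

model = SentenceTransformer("./LanguageModelling/ger-patho-bert-2")
with open("data/embedding_prepro_diag.pkl", "rb") as f:
    diag_texts = pickle.load(f)  # assumed corpus format: list of report strings

embeddings = model.encode(diag_texts, show_progress_bar=True)  # one vector per report
with open(embedding_backup_folder + "diag_embeddings_backup.pkl", "wb") as f:
    pickle.dump(embeddings, f)
```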
TextClustering/cluster_scores2latextable.py
@@ -14,10 +14,10 @@ recalc_cls_accuracy = True
 use_always_bow_data_for_svm_accuracy = True
 sort_table_by = ['s-score']  # s-score or cls accuracy
 table_save_path = 'TextClustering/tables/cluster_metrics_overview'
-path2corpus_bow_preprocessed = 'database/bow_prepro_diag.pkl'
-path2corpus_embedding_preprocessed = 'database/embedding_prepro_diag.pkl'
+path2corpus_bow_preprocessed = 'data/bow_prepro_diag.pkl'
+path2corpus_embedding_preprocessed = 'data/embedding_prepro_diag.pkl'
 scorepath = "TextClustering/cluster_metrics/"
-df_cases_file = './database/df_cases.pkl'
+df_cases_file = './data/df_cases.pkl'

 def main():
TextClustering/clustering_pipeline.py
 import os, sys

 # params:
-path2corpus_bow_preprocessed_diagnosis = 'database/bow_prepro_diag.pkl'
-path2corpus_embedding_preprocessed_diagnosis = 'database/embedding_prepro_diag.pkl'
+path2corpus_bow_preprocessed_diagnosis = 'data/bow_prepro_diag.pkl'
+path2corpus_embedding_preprocessed_diagnosis = 'data/embedding_prepro_diag.pkl'

 # check if we are at correct working directory:
 workdir = os.getcwd()
TextClustering/clusterset_histos.py
@@ -7,7 +7,7 @@ import argparse
 sys.path.append(os.getcwd())

 parser = argparse.ArgumentParser()
-parser.add_argument("--df_cases_file", default="database/df_cases.pkl")
+parser.add_argument("--df_cases_file", default="data/df_cases.pkl")
 args = parser.parse_args()

 plot_author_histos = False
TextClustering/generate_topicwords.py
@@ -6,7 +6,7 @@ import pickle
 import openpyxl

 # parameters:
-df_cases_file = "database/bow_short_diag/bow_short_diag.df.pkl"
+df_cases_file = "data/bow_short_diag/bow_short_diag.df.pkl"
 print_latex = False
 filter_stop_words = True
 path2umap_pics = 'TextClustering/plots/UMAP/'
TextClustering/plot_clustersets.py
@@ -11,7 +11,7 @@ plot_titles = ["GSDPMM (UMAP representation)", "k-means (UMAP representation)",
                "top2vec (UMAP representation)", "Patho-BERT (UMAP representation)",
                "German-BERT (UMAP representation)"]
 plot_titles = ["HDBSCAN (UMAP representation)"]
-df_cases_file = "database/bow_short_diag/bow_short_diag.df.pkl"
+df_cases_file = "data/bow_short_diag/bow_short_diag.df.pkl"

 def save_umap_plot(clustersetname, df, title=None):
     if not 'label_' + clustersetname in df:
@@ -23,7 +23,7 @@ def save_umap_plot(clustersetname, df, title=None):
         umap_text_features2D = np.asarray([[e for e in df['umapX_' + clustersetname]],
                                            [e for e in df['umapY_' + clustersetname]]])
     except:
-        print("there is no umapX_" + clustersetname + " in database/df_cases.pkl. => skipping")
+        print("there is no umapX_" + clustersetname + " in data/df_cases.pkl. => skipping")
         return
     umap_text_features2D = np.transpose(umap_text_features2D)
TextClustering/utils_wordlist.py
@@ -376,7 +376,7 @@ def print_excel_topciwordlist_as_latex(cluster_set, excel_file_path=None, topic_
     if excel_file_path == None:
         excel_file_path = 'TextClustering/tables/WordsPerCluster_' + cluster_set + '_temp.xlsx'

-    df_cases = pd.read_pickle("database/df_cases.pkl")
+    df_cases = pd.read_pickle("data/df_cases.pkl")

     # convert nan-values to int(-1):
     clusters = label_list_as_int_list(df_cases['label_' + cluster_set])
Moved files:
database/bow_prepro_desc_meta.json → data/bow_prepro_desc_meta.json
database/bow_short_diag/bow_short_diag_config.json → data/bow_short_diag/bow_short_diag_config.json
database/embedding_prepro_desc_meta.json → data/embedding_prepro_desc_meta.json
database/embedding_prepro_diag_meta.json → data/embedding_prepro_diag_meta.json
database/preprocessed_reports/bow_prepro_diag_config.json → data/preprocessed_reports/bow_prepro_diag_config.json
database_preparation/count_oov_cases.py
@@ -4,8 +4,8 @@ import pickle
 # script parameters:
 modelname = "bert-base-german-cased"
-path2corpus_embedding_preprocessed_diagnosis = 'database/embedding_prepro_diag.pkl'
-path2corpus_embedding_preprocessed_description = 'database/embedding_prepro_desc.pkl'
+path2corpus_embedding_preprocessed_diagnosis = 'data/embedding_prepro_diag.pkl'
+path2corpus_embedding_preprocessed_description = 'data/embedding_prepro_desc.pkl'

 tokenizer = AutoTokenizer.from_pretrained(modelname)
 model = AutoModelForMaskedLM.from_pretrained(modelname)
database_preparation/data_preparation_pipeline.py
@@ -5,10 +5,10 @@ import os
 path_to_reports = '../DataNephroTexts/reports'
 author_names = "Name1 Name2 Name3 Name4"  ## names of the pathologists who wrote the reports
 splitted_reports_folder_path = '../DataNephroTexts'
-path2corpus_bow_preprocessed_diagnosis = 'database/bow_prepro_diag.pkl'
-path2corpus_embedding_preprocessed_diagnosis = 'database/embedding_prepro_diag.pkl'
-path2corpus_bow_preprocessed_description = 'database/bow_prepro_desc.pkl'
-path2corpus_embedding_preprocessed_description = 'database/embedding_prepro_desc.pkl'
+path2corpus_bow_preprocessed_diagnosis = 'data/bow_prepro_diag.pkl'
+path2corpus_embedding_preprocessed_diagnosis = 'data/embedding_prepro_diag.pkl'
+path2corpus_bow_preprocessed_description = 'data/bow_prepro_desc.pkl'
+path2corpus_embedding_preprocessed_description = 'data/embedding_prepro_desc.pkl'

 # check if we are at correct working directory:
 workdir = os.getcwd()
database_preparation/label_reports_with_authors.py
@@ -34,7 +34,7 @@ def get_names(text):
             names.append(token)
     return names

-def add_author_labels_to_df_cases(path_to_end_sections, authors, df_cases_file="database/df_cases.pkl"):
+def add_author_labels_to_df_cases(path_to_end_sections, authors, df_cases_file="data/df_cases.pkl"):
     df = pd.read_pickle(df_cases_file)
     filenames = df["end_text_files"]
     author_labels = []
database_preparation/preprocess.py
@@ -323,7 +323,7 @@ def main():
     sys.path.append(os.getcwd())
     parser = argparse.ArgumentParser()
     parser.add_argument("--path_to_preprocessing_params",
-                        default='database/preprocessed_reports/bow_prepro_diag_config.json')
+                        default='data/preprocessed_reports/bow_prepro_diag_config.json')
     parser.add_argument("--target_path", default=None)
     args = parser.parse_args()
database_preparation/save_vectorized_texts.py
@@ -6,16 +6,16 @@ from sklearn.feature_extraction.text import TfidfVectorizer
 from database_preparation.preprocess import print_meta_data, prepro_params_2_string

 # parameters:
-df_cases_file = "database/df_cases.pkl"
-text_corpus_paths = ['database/embedding_prepro_diag.pkl',
-                     'database/bow_prepro_diag.pkl',
-                     'database/embedding_prepro_desc.pkl',
-                     'database/bow_prepro_desc.pkl']
-vector_corpus_paths = ['database/diagnosis_texts_vectorized_DR_preprocessed.pkl',
-                       'database/diagnosis_texts_vectorized_bow_preprocessed.pkl',
-                       'database/description_texts_vectorized_DR_preprocessed.pkl',
-                       'database/description_texts_vectorized_bow_preprocessed.pkl']
+df_cases_file = "data/df_cases.pkl"
+text_corpus_paths = ['data/embedding_prepro_diag.pkl',
+                     'data/bow_prepro_diag.pkl',
+                     'data/embedding_prepro_desc.pkl',
+                     'data/bow_prepro_desc.pkl']
+vector_corpus_paths = ['data/diagnosis_texts_vectorized_DR_preprocessed.pkl',
+                       'data/diagnosis_texts_vectorized_bow_preprocessed.pkl',
+                       'data/description_texts_vectorized_DR_preprocessed.pkl',
+                       'data/description_texts_vectorized_bow_preprocessed.pkl']

 ####### functions ##########
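The two path lists line up index-for-index: corpus i is vectorized into vector path i. A minimal sketch of that mapping using the TfidfVectorizer import shown in the hunk header; the loop body is an assumption about what the elided part of the script does:

```python
# Hypothetical sketch: vectorize each pickled text corpus and save it under the matching path.
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

for text_path, vector_path in zip(text_corpus_paths, vector_corpus_paths):
    with open(text_path, "rb") as f:
        texts = pickle.load(f)  # assumed: list of preprocessed report strings
    vectors = TfidfVectorizer().fit_transform(texts)  # sparse document-term matrix
    with open(vector_path, "wb") as f:
        pickle.dump(vectors, f)
```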
database_preparation/split_reports.py
@@ -33,7 +33,7 @@ parser.add_argument("--path_to_reports",
 parser.add_argument("--target_folder_path", default='../DataNephroTexts')
-parser.add_argument("--df_cases_file", default='database/df_cases.pkl')
+parser.add_argument("--df_cases_file", default='data/df_cases.pkl')
 parser.add_argument("--author_names", default="Name1 Name2")
 parser.add_argument("--text_encoding",
database_preparation/utils_labeled_datasets.py
@@ -74,8 +74,8 @@ def text_label_2_labeled_dataset(texts, unfiltered_labels, print_infos=False):
 def text_label_files_to_labeled_dataset(label_set,
-                                        path2corpus="./database/bow_prepro_desc.pkl",
-                                        df_cases_path="./database/df_cases.pkl",
+                                        path2corpus="./data/bow_prepro_desc.pkl",
+                                        df_cases_path="./data/df_cases.pkl",
                                         print_infos=False):
     '''
     - sorts out outlier documents (which belong to cluster -1 or cluster None)
     - converts the pandas dataframe to datasets.Dataset type.
@@ -89,11 +89,11 @@ def text_label_files_to_labeled_dataset(label_set,
 def get_all_label_set_ids():
-    df = pd.read_pickle("./database/df_cases.pkl")
+    df = pd.read_pickle("./data/df_cases.pkl")
     return [e[6:] for e in df.columns if "label_" in e]

 def get_filename_label_tuple(label_set, get_micro_txt=True,
-                             df_cases_file="./database/df_cases.pkl"):
+                             df_cases_file="./data/df_cases.pkl"):
     '''
     returns textfilename_list, label_lists as ([filenames], [labels as int list])
     it will contain outlier labels (they have value None or -1)
@@ -110,7 +110,7 @@ def get_filename_label_tuple(label_set, get_micro_txt=True, df_cases_file="./dat
     return df_cases["diagnosis_text_files"], int_labels

-def get_amount_unique_labels(label_set, df_cases_file="./database/df_cases.pkl"):
+def get_amount_unique_labels(label_set, df_cases_file="./data/df_cases.pkl"):
     '''
     returns the amount of unique labels (does not count nan or -1 classes!).
     If label_set does not exist, you will get
@@ -146,13 +146,13 @@ def generate_save_hf_dataset(label_set="LDA", overwrite=True, lower=False):
     '''
     Generates a labeled dataset of type datasets.Dataset
     (datasets is a library from huggingface)
-    and saves it under "./database/labeled_dataframes/labeld_dataset_" + label_set
+    and saves it under "./data/labeled_dataframes/labeld_dataset_" + label_set
     '''
     import datasets
     import pyarrow as pa

-    dataset_path = "./database/labeled_dataframes/labeld_dataset_" + label_set
+    dataset_path = "./data/labeled_dataframes/labeld_dataset_" + label_set

     if os.path.exists(dataset_path):
         print(dataset_path + " already exists.")
@@ -162,11 +162,11 @@ def generate_save_hf_dataset(label_set="LDA", overwrite=True, lower=False):
         print("skipping generation of " + dataset_path)
         return

-    df_cases = pd.read_pickle("./database/df_cases.pkl")
+    df_cases = pd.read_pickle("./data/df_cases.pkl")
     # print(df_cases.columns)

     # collect all text-label pairs, skipping invalid labels!
-    diag_text_rokenized = pd.read_pickle("./database/diag_lst_tokenized.pkl")
+    diag_text_rokenized = pd.read_pickle("./data/diag_lst_tokenized.pkl")
     texts = []
     labels = []
     diagnoses = []
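The docstrings above describe the per-label-set helpers; a compact sketch of how they chain together under the new data/ paths (the loop is illustrative, not part of the commit):

```python
# Hypothetical sketch: enumerate stored cluster sets and inspect their label counts.
for label_set in get_all_label_set_ids():      # from column names "label_<id>" in data/df_cases.pkl
    n = get_amount_unique_labels(label_set)    # ignores nan/-1 outlier classes
    filenames, labels = get_filename_label_tuple(label_set)
    print(label_set, n, len(filenames))
```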
topic_modeling/README.md (new file)

# Topic modeling loop project
## Installation
Create a new environment, then install the required Python packages with:
```pip install -r requirements.txt```
## Usage
... still in the experimenting phase ...
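Since the usage section is still a stub, here is a minimal stand-in sketch of a topic-modeling loop built on scikit-learn (already used elsewhere in this repo); the topic-wizard API itself is not shown in this commit, so everything below is an assumption:

```python
# Hypothetical sketch: fit LDA topic models for a few topic counts and print top words.
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

docs = ["glomeruli with segmental sclerosis", "tubular atrophy and fibrosis"]  # placeholder corpus

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs)
words = vectorizer.get_feature_names_out()

for n_topics in (2, 3):  # the "loop" over candidate topic counts
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=0).fit(X)
    for k, topic in enumerate(lda.components_):
        top = [words[i] for i in topic.argsort()[::-1][:3]]
        print(f"{n_topics} topics, topic {k}: {top}")
```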
topic_modeling/requirements.txt (new file)
topic-wizard
TextClustering/topic-modeling-analysis.py → topic_modeling/topic-modeling-analysis.py (file moved)