Maximilian Legnar / NLP in diagnostic texts from nephropathology

Commit d3fb7ac6, authored Dec 19, 2024 by max-laptop
added a pipeline for clustering on our new data-structures (see custom_topic_modeling_pipeline.py)
parent e47051e4

Showing 8 changed files with 60 additions and 10 deletions (+60 -10)
TextClustering/generate_topicwords.py  +4 -0
TextClustering/plot_clustersets.py  +6 -0
data/bow_diag_clustering/bow_diag_config.json  +1 -1
data/bow_merged_comments/bow_merged_comments_config.json  +11 -0
data/bow_short_diag/bow_short_diag_config.json  +1 -1
database_preparation/preprocess.py  +1 -1
topic_modeling/custom_topic_modeling_pipeline.py  +34 -0
topic_modeling/topic-modeling-analysis.py  +2 -7
TextClustering/generate_topicwords.py

@@ -4,9 +4,13 @@ from database_preparation.utils_labeled_datasets import label_list_as_int_list
 from database_preparation.preprocess import get_metadata
 import pickle
 import openpyxl
+import os, sys
+
+argv = sys.argv
 
 # parameters:
 df_cases_file = "data/bow_short_diag/bow_short_diag.df.pkl"
+df_cases_file = argv[1]
 print_latex = False
 filter_stop_words = True
 path2umap_pics = 'TextClustering/plots/UMAP/'
...
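With this change the corpus pickle is no longer fixed: the hard-coded default is overridden by the first command-line argument, and the same pattern is applied to plot_clustersets.py below. Note that argv[1] raises an IndexError when no argument is given, so the script now requires one. A typical invocation from the repository root, using the corpus file named in the diff:

  python TextClustering/generate_topicwords.py data/bow_short_diag/bow_short_diag.df.pkl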
TextClustering/plot_clustersets.py

@@ -2,6 +2,7 @@ import pandas as pd
 from TextClustering.utils_metrics import cluster_scatter_plot
 import numpy as np
 from database_preparation.utils_labeled_datasets import label_list_as_int_list
+import sys, os
 
 clustersets = ["GSDPMM", "KMeans", "LDA", "HDBSCAN",
                "top2vec", "Patho_BERT", "German_BERT"]
...
@@ -11,8 +12,13 @@ plot_titles = ["GSDPMM (UMAP representation)", "k-means (UMAP representation)",
                "top2vec (UMAP representation)", "Patho-BERT (UMAP representation)",
                "German-BERT (UMAP representation)"]
 plot_titles = ["HDBSCAN (UMAP representation)"]
+
+argv = sys.argv
 df_cases_file = "data/bow_short_diag/bow_short_diag.df.pkl"
+df_cases_file = argv[1]
 
 def save_umap_plot(clustersetname, df, title=None):
     if not 'label_' + clustersetname in df:
         print("skipping " + clustersetname + ", it is not in df_cases_file.")
...
data/preprocessed_reports/bow_prepro_diag_config.json → data/bow_diag_clustering/bow_diag_config.json

 {
-"source_data": {"path_to_dataset": "database/nephro_reports_sectiondivided.newest", "sections": ["conclusion"]},
+"source_data": {"path_to_dataset": "data/nephro_reports_sectiondivided.newest", "sections": ["conclusion"]},
 "tokenized": true,
 "cased": false,
 "stopword_filtered": true,
...
data/bow_merged_comments/bow_merged_comments_config.json (new file, 0 → 100644)

{
"source_data": {"path_to_dataset": "data/nephro_reports_sectiondivided.newest", "sections": ["short_diagnose", "short_symptoms", "CD73_result"]},
"tokenized": true,
"cased": true,
"stopword_filtered": false,
"use_combiner": false,
"use_replacer": false,
"lemma_mode": 4,
"punct_mode": 2,
"number_mode": 1}
\ No newline at end of file
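This config drives preprocess.py, which evidently writes the preprocessed corpus to the matching .df.pkl path; custom_topic_modeling_pipeline.py (below) derives that path from the config name by string replacement. A minimal sketch of the convention, runnable once the config file exists:

import json

confi_file = "data/bow_merged_comments/bow_merged_comments_config.json"

# drop the '_config' suffix and swap the extension, as the pipeline script does:
df_file = confi_file.replace('.json', '.df.pkl').replace('_config', '')
print(df_file)  # -> data/bow_merged_comments/bow_merged_comments.df.pkl

# the config is plain JSON and can be inspected directly:
with open(confi_file) as f:
    params = json.load(f)
print(params["source_data"]["sections"])  # ['short_diagnose', 'short_symptoms', 'CD73_result']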
data/bow_short_diag/bow_short_diag_config.json

 {
-"source_data": {"path_to_dataset": "database/nephro_reports_sectiondivided.newest", "sections": ["short_diagnose"]},
+"source_data": {"path_to_dataset": "data/nephro_reports_sectiondivided.newest", "sections": ["short_diagnose"]},
 "tokenized": true,
 "cased": true,
 "stopword_filtered": false,
...
database_preparation/preprocess.py

@@ -323,7 +323,7 @@ def main():
     sys.path.append(os.getcwd())
     parser = argparse.ArgumentParser()
     parser.add_argument("--path_to_preprocessing_params",
-                        default='data/preprocessed_reports/bow_prepro_diag_config.json')
+                        default='data/bow_diag_clustering/bow_diag_config.json')
     parser.add_argument("--target_path",
                         default=None)
     args = parser.parse_args()
...
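The default now points at the relocated bow_diag_config.json; any other config is passed explicitly, which is exactly what the pipeline script below does. For example:

  python database_preparation/preprocess.py --path_to_preprocessing_params data/bow_merged_comments/bow_merged_comments_config.json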
topic_modeling/custom_topic_modeling_pipeline.py (new file, 0 → 100644)

import os, sys

# params:
#confi_file = "data/bow_short_diag/bow_short_diag_config.json"
#confi_file = "data/bow_diag_clustering/bow_diag_config.json"
confi_file = "data/bow_merged_comments/bow_merged_comments_config.json"
df_file = confi_file.replace('.json', '.df.pkl').replace('_config', '')

# check if we are at the correct working directory:
workdir = os.getcwd()
if not workdir[-len('nlp-in-diagnostic-texts-from-nephropathology'):] == 'nlp-in-diagnostic-texts-from-nephropathology':
    print(workdir + " is the wrong working directory.")
    print("please make sure to run this script with working directory '.../path/to/nlp-in-diagnostic-texts-from-nephropathology'.")
    exit(1)

# Construct clustering pipeline. This is a suggestion for how to use all the scripts.
# I also recommend running each clustering script one by one to fine-tune the clusterings (with argument --find_k_value).
script_queue = [
    f"python database_preparation/preprocess.py --path_to_preprocessing_params {confi_file}",
    f"python TextClustering/basedOn_BOW/HDBSCAN_Diagnosis.py --path2corpus {df_file} --k_value {20}",
    f"python TextClustering/plot_clustersets.py '{df_file}'",
    f"python TextClustering/generate_topicwords.py '{df_file}'",
]

for script in script_queue:
    print("\n########################################### executing ###########################################")
    print(script)
    print("####################################################################################################\n")
    os.system(script)
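As the working-directory guard indicates, the script is meant to be launched from the repository root:

  cd /path/to/nlp-in-diagnostic-texts-from-nephropathology
  python topic_modeling/custom_topic_modeling_pipeline.py

One caveat: os.system does not check the exit status of each step, so a failed preprocessing run will not stop the queue; replacing it with subprocess.run(script, shell=True, check=True) would be one way to abort on the first failure.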
topic_modeling/topic-modeling-analysis.py

@@ -17,16 +17,11 @@ if __name__ == '__main__':
     topic_pipeline = make_pipeline(vectorizer, model)
 
-    from sklearn.datasets import fetch_20newsgroups
-    newsgroups = fetch_20newsgroups(subset="all")
-    corpus = newsgroups.data
-    # Sklearn gives the labels back as integers, we have to map them back to
-    # the actual textual label.
-    group_labels = [newsgroups.target_names[label] for label in newsgroups.target]
-
     topic_pipeline.fit(corpus)
 
-    print("launching topicwizard visualizer")
+    print("launching topicwizard visualizer (may take a while)...")
     topicwizard.visualize(corpus, model=topic_pipeline)
     exit(0)
\ No newline at end of file
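The deleted block fitted the pipeline on scikit-learn's 20 newsgroups demo data; after this commit, topic_pipeline.fit(corpus) runs on the project's own corpus prepared earlier in the file. For reference, a self-contained sketch of the same topicwizard call; the CountVectorizer/NMF pipeline and the toy corpus are placeholder assumptions, since the file's actual vectorizer and model sit outside this hunk:

# minimal sketch (pip install topic-wizard scikit-learn); pipeline and corpus are placeholders
import topicwizard
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline

corpus = [
    "chronic glomerulonephritis with interstitial fibrosis",
    "acute tubular necrosis, no signs of rejection",
    "IgA nephropathy with mesangial proliferation",
]

vectorizer = CountVectorizer()   # bag-of-words features
model = NMF(n_components=2)      # stand-in topic model
topic_pipeline = make_pipeline(vectorizer, model)
topic_pipeline.fit(corpus)

print("launching topicwizard visualizer (may take a while)...")
topicwizard.visualize(corpus, model=topic_pipeline)  # opens the interactive dashboard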