Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
N
NLP in diagnostic texts from nephropathology
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Maximilian Legnar
NLP in diagnostic texts from nephropathology
Commits
c7bab612
Commit
c7bab612
authored
Sep 26, 2022
by
max
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Now split_reports.py and cluster plots should work better with golden labels
parent
8ed471a1
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
52 additions
and
30 deletions
+52
-30
clusterset_histos.py
TextClustering/clusterset_histos.py
+16
-12
plot_clustersets.py
TextClustering/plot_clustersets.py
+7
-7
utils_metrics.py
TextClustering/utils_metrics.py
+6
-1
bow_prepro_desc_meta.json
database/bow_prepro_desc_meta.json
+1
-1
bow_prepro_diag_meta.json
database/bow_prepro_diag_meta.json
+1
-1
reportPreparationTools.py
database_preparation/reportPreparationTools.py
+1
-1
split_reports.py
database_preparation/split_reports.py
+20
-7
No files found.
TextClustering/clusterset_histos.py
View file @
c7bab612
...
@@ -14,10 +14,9 @@ plot_author_histos = False
...
@@ -14,10 +14,9 @@ plot_author_histos = False
cluster
=
2
cluster
=
2
clustersets
=
[
"HDBSCAN"
,
"KMeans"
,
"LDA"
,
"GSDPMM"
,
clustersets
=
[
"HDBSCAN"
,
"KMeans"
,
"LDA"
,
"GSDPMM"
,
"top2vec"
,
"Patho_BERT"
,
"German_BERT"
]
"top2vec"
,
"Patho_BERT"
,
"German_BERT"
,
"golden"
]
df
=
pd
.
read_pickle
(
args
.
df_cases_file
)
df
=
pd
.
read_pickle
(
args
.
df_cases_file
)
authors_labels
=
df
[
"label_author"
]
# plot histograms: how much docs do have the same label=cluster-index?
# plot histograms: how much docs do have the same label=cluster-index?
for
i
,
label_set
in
enumerate
(
clustersets
):
for
i
,
label_set
in
enumerate
(
clustersets
):
...
@@ -27,19 +26,24 @@ for i,label_set in enumerate(clustersets):
...
@@ -27,19 +26,24 @@ for i,label_set in enumerate(clustersets):
except
:
except
:
print
(
f
"skipping {label_set}. it is not in the df_cases_file."
)
print
(
f
"skipping {label_set}. it is not in the df_cases_file."
)
continue
continue
if
plot_author_histos
:
if
plot_author_histos
:
if
'label_author'
in
df
:
authors_labels
=
df
[
"label_author"
]
authors_of_cluster
=
[
authors_labels
[
i
]
for
i
,
label
in
enumerate
(
cluster_labels
)
if
authors_of_cluster
=
[
authors_labels
[
i
]
for
i
,
label
in
enumerate
(
cluster_labels
)
if
label
==
cluster
]
label
==
cluster
]
authors
=
np
.
asarray
(
authors_of_cluster
)
authors
=
np
.
asarray
(
authors_of_cluster
)
x
=
[
-
1
,
0
,
1
,
2
,
3
]
x
=
[
-
1
,
0
,
1
,
2
,
3
]
h
=
[]
h
=
[]
for
l
in
x
:
for
l
in
x
:
h
.
append
(
sum
([
1
for
a
in
authors
if
a
==
l
]))
h
.
append
(
sum
([
1
for
a
in
authors
if
a
==
l
]))
plt
.
bar
(
x
,
height
=
h
)
plt
.
bar
(
x
,
height
=
h
)
plt
.
title
(
label_set
+
" authors in cluster "
+
str
(
cluster
))
plt
.
title
(
label_set
+
" authors in cluster "
+
str
(
cluster
))
file_path
=
'TextClustering/plots/histograms/histogram_'
+
label_set
+
"_cluster"
+
str
(
cluster
)
+
"_authors.png"
file_path
=
'TextClustering/plots/histograms/histogram_'
+
label_set
+
"_cluster"
+
str
(
cluster
)
+
"_authors.png"
else
:
print
(
f
'Cant plot author histos, there is not "label_author" in df_cases.'
)
else
:
else
:
labels
=
np
.
asarray
([
l
for
l
in
cluster_labels
if
l
!=
-
1
])
labels
=
np
.
asarray
([
l
for
l
in
cluster_labels
if
l
!=
-
1
])
...
...
TextClustering/plot_clustersets.py
View file @
c7bab612
...
@@ -13,8 +13,7 @@ df_cases_file = "database/df_cases.pkl"
...
@@ -13,8 +13,7 @@ df_cases_file = "database/df_cases.pkl"
def
save_umap_plot
(
clustersetname
,
df
,
title
=
None
):
def
save_umap_plot
(
clustersetname
,
df
,
title
=
None
):
if
not
'label_'
+
clustersetname
in
df
:
if
not
'label_'
+
clustersetname
in
df
:
print
(
"skipping "
+
clustersetname
+
", it is not in df_cases_file:"
)
print
(
"skipping "
+
clustersetname
+
", it is not in df_cases_file."
)
print
(
df
)
return
return
predictedCluster_text_features
=
label_list_as_int_list
(
df
[
'label_'
+
clustersetname
])
predictedCluster_text_features
=
label_list_as_int_list
(
df
[
'label_'
+
clustersetname
])
...
@@ -39,7 +38,7 @@ def save_umap_plot(clustersetname, df, title=None):
...
@@ -39,7 +38,7 @@ def save_umap_plot(clustersetname, df, title=None):
golden_labels
=
df
[
"label_golden"
]
golden_labels
=
df
[
"label_golden"
]
cluster_scatter_plot
(
umap_text_features2D
,
golden_labels
,
cluster_scatter_plot
(
umap_text_features2D
,
golden_labels
,
"TextClustering/plots/UMAP/"
+
clustersetname
+
"_UMAP_goldenlabel.png"
,
"TextClustering/plots/UMAP/"
+
clustersetname
+
"_UMAP_goldenlabel.png"
,
show_plot
=
False
,
colorblindfriendly
=
Tru
e
show_plot
=
False
,
colorblindfriendly
=
Fals
e
,
fig_title
=
title
+
" colored with golden labels"
)
,
fig_title
=
title
+
" colored with golden labels"
)
def
main
():
def
main
():
...
@@ -63,10 +62,11 @@ def main():
...
@@ -63,10 +62,11 @@ def main():
"TextClustering/plots/PCA/LDA_PCA.png"
,
"TextClustering/plots/PCA/LDA_PCA.png"
,
show_plot
=
False
,
colorblindfriendly
=
False
,
show_plot
=
False
,
colorblindfriendly
=
False
,
fig_title
=
"LDA (PCA representation)"
)
fig_title
=
"LDA (PCA representation)"
)
cluster_scatter_plot
(
features2D
,
df
[
"label_author"
],
if
'label_author'
in
df
:
"TextClustering/plots/PCA/LDA_PCA_authors.png"
,
cluster_scatter_plot
(
features2D
,
df
[
"label_author"
],
show_plot
=
False
,
colorblindfriendly
=
True
,
"TextClustering/plots/PCA/LDA_PCA_authors.png"
,
number_data_points
=
False
,
fig_title
=
'LDA (PCA representation), colored by authors'
)
show_plot
=
False
,
colorblindfriendly
=
True
,
number_data_points
=
False
,
fig_title
=
'LDA (PCA representation), colored by authors'
)
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
main
()
main
()
TextClustering/utils_metrics.py
View file @
c7bab612
...
@@ -169,7 +169,12 @@ def cluster_scatter_plot(umap_text_features2D, labels, file_path = [],
...
@@ -169,7 +169,12 @@ def cluster_scatter_plot(umap_text_features2D, labels, file_path = [],
nummerate_clusters_in_plot
(
x
,
y
,
c
)
nummerate_clusters_in_plot
(
x
,
y
,
c
)
'''plt.legend(handles=scatter.legend_elements()[0],
'''plt.legend(handles=scatter.legend_elements()[0],
labels=[str(l) for l in c], loc="best")'''
labels=[str(l) for l in c], loc="best")'''
plt
.
colorbar
(
values
=
[
int
(
e
)
for
e
in
np
.
unique
(
np
.
asarray
(
c
))])
v
=
[
int
(
e
)
for
e
in
np
.
unique
(
np
.
asarray
(
c
))]
try
:
plt
.
colorbar
(
values
=
v
)
except
Exception
as
message
:
print
(
f
'failed to generate colorbar for {file_path}'
)
print
(
f
'{message}'
)
c
=
[
e
for
e
in
valid_labels
if
e
>
19
]
c
=
[
e
for
e
in
valid_labels
if
e
>
19
]
if
len
(
c
)
>
0
:
if
len
(
c
)
>
0
:
...
...
database/bow_prepro_desc_meta.json
View file @
c7bab612
{
"source_data"
:
"../DataNephroTexts/description"
,
"tokenized"
:
true
,
"cased"
:
false
,
"stopword_filtered"
:
true
,
"use_combiner"
:
true
,
"use_replacer"
:
true
,
"lemma_mode"
:
3
,
"punct_mode"
:
2
,
"number_mode"
:
3
}
{
"source_data"
:
"./database/description"
,
"tokenized"
:
true
,
"cased"
:
false
,
"stopword_filtered"
:
true
,
"use_combiner"
:
true
,
"use_replacer"
:
true
,
"lemma_mode"
:
3
,
"punct_mode"
:
2
,
"number_mode"
:
3
}
\ No newline at end of file
\ No newline at end of file
database/bow_prepro_diag_meta.json
View file @
c7bab612
{
"source_data"
:
"../DataNephroTexts/diagnosis"
,
"tokenized"
:
true
,
"cased"
:
false
,
"stopword_filtered"
:
true
,
"use_combiner"
:
true
,
"use_replacer"
:
true
,
"lemma_mode"
:
3
,
"punct_mode"
:
2
,
"number_mode"
:
3
}
{
"source_data"
:
"./database/diagnosis"
,
"tokenized"
:
true
,
"cased"
:
false
,
"stopword_filtered"
:
true
,
"use_combiner"
:
true
,
"use_replacer"
:
true
,
"lemma_mode"
:
3
,
"punct_mode"
:
2
,
"number_mode"
:
3
}
\ No newline at end of file
\ No newline at end of file
database_preparation/reportPreparationTools.py
View file @
c7bab612
...
@@ -19,7 +19,7 @@ def get3parts(t_file):
...
@@ -19,7 +19,7 @@ def get3parts(t_file):
#%% get start codons for description and diagnosis
#%% get start codons for description and diagnosis
# thinking that one pathologist sticks to his/her wording
# thinking that one pathologist sticks to his/her wording
start_codon_description
=
find_codon
(
t_text
,
start_codon_description
=
find_codon
(
t_text
,
[
'Lichtmikroskopie:'
,
'Mikroskopie:'
,
"Histologie:"
,
"Klinische Angaben:"
,
"Wir erhielten"
])
[
'Lichtmikroskopie:'
,
'Mikroskopie:'
,
"Histologie:"
,
"Klinische Angaben:"
,
"Wir erhielten
"
,
"Eingesandt wurde:
"
])
start_codon_2nd
=
find_codon
(
t_text
,
start_codon_2nd
=
find_codon
(
t_text
,
[
"Nachbericht"
,
"Immunhistochemie"
])
[
"Nachbericht"
,
"Immunhistochemie"
])
...
...
database_preparation/split_reports.py
View file @
c7bab612
...
@@ -41,6 +41,7 @@ parser.add_argument("--text_encoding",
...
@@ -41,6 +41,7 @@ parser.add_argument("--text_encoding",
parser
.
add_argument
(
"--use_newest_reports"
,
action
=
'store_true'
)
parser
.
add_argument
(
"--use_newest_reports"
,
action
=
'store_true'
)
parser
.
add_argument
(
"--label_in_filename_symbol"
,
parser
.
add_argument
(
"--label_in_filename_symbol"
,
default
=
''
)
default
=
''
)
parser
.
add_argument
(
"--save_labels_as_ints"
,
action
=
'store_true'
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
# %% get all files
# %% get all files
...
@@ -119,7 +120,8 @@ for idx, t_file in tqdm(enumerate(report_file_list)):
...
@@ -119,7 +120,8 @@ for idx, t_file in tqdm(enumerate(report_file_list)):
encoding
=
args
.
text_encoding
)
as
text_file
:
encoding
=
args
.
text_encoding
)
as
text_file
:
text_file
.
write
(
label
)
text_file
.
write
(
label
)
lst_labels
.
append
(
'label#'
+
str
(
idx
)
+
".txt"
)
# lst_labels.append('label#' + str(idx) + ".txt")
lst_labels
.
append
(
label
)
lst_description
.
append
(
'description#'
+
str
(
idx
)
+
".txt"
)
lst_description
.
append
(
'description#'
+
str
(
idx
)
+
".txt"
)
lst_diagnose
.
append
(
'diagnosis#'
+
str
(
idx
)
+
".txt"
)
lst_diagnose
.
append
(
'diagnosis#'
+
str
(
idx
)
+
".txt"
)
...
@@ -146,18 +148,29 @@ if args.label_in_filename_symbol == '':
...
@@ -146,18 +148,29 @@ if args.label_in_filename_symbol == '':
df
=
pd
.
DataFrame
(
list
(
zip
(
lst_description
,
lst_diagnose
,
lst_end
)),
df
=
pd
.
DataFrame
(
list
(
zip
(
lst_description
,
lst_diagnose
,
lst_end
)),
columns
=
[
'description_text_files'
,
'diagnosis_text_files'
,
'end_text_files'
])
columns
=
[
'description_text_files'
,
'diagnosis_text_files'
,
'end_text_files'
])
else
:
else
:
df
=
pd
.
DataFrame
(
list
(
zip
(
lst_description
,
lst_diagnose
,
lst_end
,
lst_labels
)),
if
args
.
save_labels_as_ints
:
columns
=
[
'description_text_files'
,
'diagnosis_text_files'
,
'end_text_files'
,
'label_files'
])
class2int
=
dict
()
for
i
,
c
in
enumerate
(
list
(
set
(
lst_labels
))):
class2int
[
c
]
=
i
df
=
pd
.
DataFrame
(
list
(
zip
(
lst_description
,
lst_diagnose
,
lst_end
,
[
class2int
[
l
]
for
l
in
lst_labels
])),
columns
=
[
'description_text_files'
,
'diagnosis_text_files'
,
'end_text_files'
,
'label_golden'
])
with
open
(
args
.
target_folder_path
+
"/classname2integer.txt"
,
"w"
)
as
text_file
:
text_file
.
write
(
'
\n
'
.
join
([
f
'{class2int[name]}
\t\t
{name}'
for
name
in
list
(
set
(
lst_labels
))]))
else
:
df
=
pd
.
DataFrame
(
list
(
zip
(
lst_description
,
lst_diagnose
,
lst_end
,
lst_labels
)),
columns
=
[
'description_text_files'
,
'diagnosis_text_files'
,
'end_text_files'
,
'label_golden'
])
df
.
to_pickle
(
args
.
df_cases_file
)
df
.
to_pickle
(
args
.
df_cases_file
)
# search for authors in end-sections in order to add them as labels to the df_cases file:
# search for authors in end-sections in order to add them as labels to the df_cases file:
try
:
if
args
.
label_in_filename_symbol
==
''
:
add_author_labels_to_df_cases
(
args
.
target_folder_path
+
'/end'
,
args
.
author_names
.
split
(
' '
),
args
.
df_cases_file
)
try
:
except
:
add_author_labels_to_df_cases
(
args
.
target_folder_path
+
'/end'
,
args
.
author_names
.
split
(
' '
),
args
.
df_cases_file
)
print
(
"label the reports with authors failed."
)
except
:
print
(
"label the reports with authors failed."
)
df
=
pd
.
read_pickle
(
args
.
df_cases_file
)
df
=
pd
.
read_pickle
(
args
.
df_cases_file
)
print
(
f
"saved df_cases at {args.df_cases_file}
\n
"
)
print
(
f
"saved df_cases at {args.df_cases_file}
\n
"
)
print
(
f
'it looks like this:'
)
print
(
f
'it looks like this:'
)
print
(
df
)
print
(
df
)
print
(
f
"first element of {args.df_cases_file}:
\n
{df.iloc[0]}"
)
print
()
print
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment