Maximilian Legnar / NLP in diagnostic texts from nephropathology

Commit c7bab612, authored 2 years ago by max:
Now split_reports.py and cluster plots should work better with golden labels
Parent: 8ed471a1
Showing 7 changed files with 52 additions and 30 deletions.
Changed files:

  TextClustering/clusterset_histos.py               +16  -12
  TextClustering/plot_clustersets.py                 +7   -7
  TextClustering/utils_metrics.py                    +6   -1
  database/bow_prepro_desc_meta.json                 +1   -1
  database/bow_prepro_diag_meta.json                 +1   -1
  database_preparation/reportPreparationTools.py     +1   -1
  database_preparation/split_reports.py             +20   -7
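The common thread of this commit is the new "golden" label set: split_reports.py now writes a label_golden column into the df_cases pickle, and the clustering and plotting scripts read it back. A minimal sketch of how the resulting table could be inspected, assuming pandas is installed and that database/df_cases.pkl was produced by split_reports.py as shown in the diffs that follow:

import pandas as pd

# Load the case table written by split_reports.py (path taken from the diffs below).
df = pd.read_pickle("database/df_cases.pkl")

# 'label_golden' is only present when the reports carried labels in their filenames;
# guard for it the same way the updated plotting scripts do.
if "label_golden" in df:
    print(df["label_golden"].value_counts())
else:
    print("no golden labels in df_cases.pkl")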
TextClustering/clusterset_histos.py

@@ -14,10 +14,9 @@ plot_author_histos = False
 cluster = 2
 clustersets = ["HDBSCAN", "KMeans", "LDA", "GSDPMM",
-               "top2vec", "Patho_BERT", "German_BERT"]
+               "top2vec", "Patho_BERT", "German_BERT", "golden"]

 df = pd.read_pickle(args.df_cases_file)
-authors_labels = df["label_author"]

 # plot histograms: how much docs do have the same label=cluster-index?
 for i, label_set in enumerate(clustersets):

@@ -27,19 +26,24 @@ for i,label_set in enumerate(clustersets):
     except:
         print(f"skipping {label_set}. it is not in the df_cases_file.")
         continue

     if plot_author_histos:
-        authors_of_cluster = [authors_labels[i] for i, label in enumerate(cluster_labels) if label == cluster]
-        authors = np.asarray(authors_of_cluster)
-        x = [-1, 0, 1, 2, 3]
-        h = []
-        for l in x:
-            h.append(sum([1 for a in authors if a == l]))
-        plt.bar(x, height=h)
-        plt.title(label_set + " authors in cluster " + str(cluster))
-        file_path = 'TextClustering/plots/histograms/histogram_' + label_set + "_cluster" + str(cluster) + "_authors.png"
+        if 'label_author' in df:
+            authors_labels = df["label_author"]
+            authors_of_cluster = [authors_labels[i] for i, label in enumerate(cluster_labels) if label == cluster]
+            authors = np.asarray(authors_of_cluster)
+            x = [-1, 0, 1, 2, 3]
+            h = []
+            for l in x:
+                h.append(sum([1 for a in authors if a == l]))
+            plt.bar(x, height=h)
+            plt.title(label_set + " authors in cluster " + str(cluster))
+            file_path = 'TextClustering/plots/histograms/histogram_' + label_set + "_cluster" + str(cluster) + "_authors.png"
+        else:
+            print(f'Cant plot author histos, there is not "label_author" in df_cases.')
     else:
         labels = np.asarray([l for l in cluster_labels if l != -1])
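The histogram block above counts, for one chosen cluster, how often each author id occurs. A short self-contained sketch of the same counting idea, with hypothetical toy data standing in for the cluster and author columns the real script reads from df_cases.pkl:

import numpy as np
import matplotlib.pyplot as plt

# toy data standing in for one clusterset column and df["label_author"]
cluster_labels = [0, 0, 1, 2, 0, 1, -1, 0]
authors_labels = [1, 2, 1, 0, 1, 3, -1, 2]
cluster = 0

# author id of every document that falls into the chosen cluster
authors = np.asarray([authors_labels[i] for i, label in enumerate(cluster_labels) if label == cluster])

# histogram over the fixed author-id range used in clusterset_histos.py (-1 = unknown author)
x = [-1, 0, 1, 2, 3]
h = [int(np.sum(authors == l)) for l in x]

plt.bar(x, height=h)
plt.title("authors in cluster " + str(cluster))
plt.savefig("histogram_authors.png")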
TextClustering/plot_clustersets.py

@@ -13,8 +13,7 @@ df_cases_file = "database/df_cases.pkl"
 def save_umap_plot(clustersetname, df, title=None):
     if not 'label_' + clustersetname in df:
-        print("skipping " + clustersetname + ", it is not in df_cases_file:")
-        print(df)
+        print("skipping " + clustersetname + ", it is not in df_cases_file.")
         return
     predictedCluster_text_features = label_list_as_int_list(df['label_' + clustersetname])

@@ -39,7 +38,7 @@ def save_umap_plot(clustersetname, df, title=None):
     golden_labels = df["label_golden"]
     cluster_scatter_plot(umap_text_features2D, golden_labels,
                          "TextClustering/plots/UMAP/" + clustersetname + "_UMAP_goldenlabel.png",
-                         show_plot=False, colorblindfriendly=True,
+                         show_plot=False, colorblindfriendly=False,
                          fig_title=title + " colored with golden labels")

@@ -63,10 +62,11 @@ def main():
                          "TextClustering/plots/PCA/LDA_PCA.png",
                          show_plot=False, colorblindfriendly=False, fig_title="LDA (PCA representation)")
-    cluster_scatter_plot(features2D, df["label_author"],
-                         "TextClustering/plots/PCA/LDA_PCA_authors.png",
-                         show_plot=False, colorblindfriendly=True, number_data_points=False,
-                         fig_title='LDA (PCA representation), colored by authors')
+    if 'label_author' in df:
+        cluster_scatter_plot(features2D, df["label_author"],
+                             "TextClustering/plots/PCA/LDA_PCA_authors.png",
+                             show_plot=False, colorblindfriendly=True, number_data_points=False,
+                             fig_title='LDA (PCA representation), colored by authors')

 if __name__ == '__main__':
     main()
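Both changed call sites apply the same guard: only plot a label set if its column actually exists in the case table, and print a skip message otherwise. A minimal sketch of that pattern in isolation; the helper name plot_if_present and the toy DataFrame are hypothetical, only label_author and the skip message come from the diff:

import pandas as pd

df = pd.DataFrame({"text": ["a", "b"], "label_author": [0, 1]})

def plot_if_present(df, column):
    # skip silently instead of raising a KeyError, as plot_clustersets.py now does
    if column not in df:
        print("skipping " + column + ", it is not in df_cases_file.")
        return
    print("would plot", list(df[column]))

plot_if_present(df, "label_author")   # plotted
plot_if_present(df, "label_golden")   # skipped with a message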
TextClustering/utils_metrics.py

@@ -169,7 +169,12 @@ def cluster_scatter_plot(umap_text_features2D, labels, file_path = [],
         nummerate_clusters_in_plot(x, y, c)
     '''plt.legend(handles=scatter.legend_elements()[0],
                   labels=[str(l) for l in c], loc="best")'''
-    plt.colorbar(values=[int(e) for e in np.unique(np.asarray(c))])
+    v = [int(e) for e in np.unique(np.asarray(c))]
+    try:
+        plt.colorbar(values=v)
+    except Exception as message:
+        print(f'failed to generate colorbar for {file_path}')
+        print(f'{message}')

     c = [e for e in valid_labels if e > 19]
     if len(c) > 0:
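cluster_scatter_plot now treats the colorbar as optional: if matplotlib cannot build one for the given label values, the figure is still saved and only a message is printed. A small sketch of the same defensive pattern, assuming numpy and matplotlib; c stands in for the cluster labels of the plotted points:

import numpy as np
import matplotlib.pyplot as plt

c = np.asarray([0, 1, 1, 2, 0])              # cluster labels of the plotted points
plt.scatter(range(len(c)), range(len(c)), c=c)

v = [int(e) for e in np.unique(c)]
try:
    plt.colorbar(values=v)
except Exception as message:
    # e.g. when the label values are unusable; keep the figure anyway
    print(f'failed to generate colorbar: {message}')

plt.savefig("scatter.png")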
database/bow_prepro_desc_meta.json

-{"source_data": "../DataNephroTexts/description", "tokenized": true, "cased": false, "stopword_filtered": true, "use_combiner": true, "use_replacer": true, "lemma_mode": 3, "punct_mode": 2, "number_mode": 3}
\ No newline at end of file
+{"source_data": "./database/description", "tokenized": true, "cased": false, "stopword_filtered": true, "use_combiner": true, "use_replacer": true, "lemma_mode": 3, "punct_mode": 2, "number_mode": 3}
\ No newline at end of file
database/bow_prepro_diag_meta.json

-{"source_data": "../DataNephroTexts/diagnosis", "tokenized": true, "cased": false, "stopword_filtered": true, "use_combiner": true, "use_replacer": true, "lemma_mode": 3, "punct_mode": 2, "number_mode": 3}
\ No newline at end of file
+{"source_data": "./database/diagnosis", "tokenized": true, "cased": false, "stopword_filtered": true, "use_combiner": true, "use_replacer": true, "lemma_mode": 3, "punct_mode": 2, "number_mode": 3}
\ No newline at end of file
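Both meta files only change source_data from the old ../DataNephroTexts layout to the in-repo ./database folders. A minimal sketch of reading such a preprocessing meta file back, assuming only the key names shown above; the consuming code is not part of this commit:

import json

with open("database/bow_prepro_diag_meta.json", "r") as f:
    meta = json.load(f)

print(meta["source_data"])                                   # "./database/diagnosis" after this commit
print(meta["lemma_mode"], meta["punct_mode"], meta["number_mode"])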
database_preparation/reportPreparationTools.py

@@ -19,7 +19,7 @@ def get3parts(t_file):
     #%% get start codons for description and diagnosis
     # thinking that one pathologist sticks to his/her wording
     start_codon_description = find_codon(t_text,
-                                         ['Lichtmikroskopie:', 'Mikroskopie:', "Histologie:", "Klinische Angaben:", "Wir erhielten"])
+                                         ['Lichtmikroskopie:', 'Mikroskopie:', "Histologie:", "Klinische Angaben:", "Wir erhielten ", "Eingesandt wurde: "])
     start_codon_2nd = find_codon(t_text, ["Nachbericht", "Immunhistochemie"])
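get3parts() splits a report at the first occurrence of one of several start markers ("codons"); the commit only extends the marker list for the description section. find_codon itself is not part of the diff, so the following is a purely hypothetical sketch of such a first-match search, not the project's implementation:

def find_codon(text, codons):
    # hypothetical behaviour: return the earliest occurring marker and its position, or (None, -1)
    hits = [(text.find(c), c) for c in codons if text.find(c) != -1]
    if not hits:
        return None, -1
    pos, codon = min(hits)
    return codon, pos

t_text = "Klinische Angaben: ...\nMikroskopie: ...\nNachbericht ..."
print(find_codon(t_text, ['Lichtmikroskopie:', 'Mikroskopie:', "Histologie:",
                          "Klinische Angaben:", "Wir erhielten"]))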
database_preparation/split_reports.py

@@ -41,6 +41,7 @@ parser.add_argument("--text_encoding",
 parser.add_argument("--use_newest_reports", action='store_true')
 parser.add_argument("--label_in_filename_symbol", default='')
+parser.add_argument("--save_labels_as_ints", action='store_true')

 args = parser.parse_args()

 # %% get all files

@@ -119,7 +120,8 @@ for idx, t_file in tqdm(enumerate(report_file_list)):
                   encoding=args.text_encoding) as text_file:
             text_file.write(label)
-        lst_labels.append('label#' + str(idx) + ".txt")
+        # lst_labels.append('label#' + str(idx) + ".txt")
+        lst_labels.append(label)

     lst_description.append('description#' + str(idx) + ".txt")
     lst_diagnose.append('diagnosis#' + str(idx) + ".txt")

@@ -146,18 +148,29 @@ if args.label_in_filename_symbol == '':
     df = pd.DataFrame(list(zip(lst_description, lst_diagnose, lst_end)),
                       columns=['description_text_files', 'diagnosis_text_files', 'end_text_files'])
 else:
-    df = pd.DataFrame(list(zip(lst_description, lst_diagnose, lst_end, lst_labels)),
-                      columns=['description_text_files', 'diagnosis_text_files', 'end_text_files', 'label_files'])
+    if args.save_labels_as_ints:
+        class2int = dict()
+        for i, c in enumerate(list(set(lst_labels))):
+            class2int[c] = i
+        df = pd.DataFrame(list(zip(lst_description, lst_diagnose, lst_end, [class2int[l] for l in lst_labels])),
+                          columns=['description_text_files', 'diagnosis_text_files', 'end_text_files', 'label_golden'])
+        with open(args.target_folder_path + "/classname2integer.txt", "w") as text_file:
+            text_file.write('\n'.join([f'{class2int[name]}\t\t{name}' for name in list(set(lst_labels))]))
+    else:
+        df = pd.DataFrame(list(zip(lst_description, lst_diagnose, lst_end, lst_labels)),
+                          columns=['description_text_files', 'diagnosis_text_files', 'end_text_files', 'label_golden'])

 df.to_pickle(args.df_cases_file)

 # search for authors in end-sections in order to add them as labels to the df_cases file:
-try:
-    add_author_labels_to_df_cases(args.target_folder_path + '/end', args.author_names.split(' '), args.df_cases_file)
-except:
-    print("label the reports with authors failed.")
+if args.label_in_filename_symbol == '':
+    try:
+        add_author_labels_to_df_cases(args.target_folder_path + '/end', args.author_names.split(' '), args.df_cases_file)
+    except:
+        print("label the reports with authors failed.")
+
+df = pd.read_pickle(args.df_cases_file)

 print(f"saved df_cases at {args.df_cases_file}\n")
 print(f'it looks like this:')
 print(df)
 print(f"first element of {args.df_cases_file}:\n{df.iloc[0]}")
 print()
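The new --save_labels_as_ints path builds a class-name-to-integer mapping, stores the integer labels as label_golden, and writes a classname2integer.txt lookup file. A toy sketch of that mapping step in isolation; the label values are hypothetical and the output path is shortened:

# hypothetical class names standing in for the labels parsed from the report filenames
lst_labels = ["IgA-Nephropathie", "Amyloidose", "IgA-Nephropathie", "Lupusnephritis"]

# enumerate the distinct class names; note that iterating a set makes the numbering run-dependent
class2int = {c: i for i, c in enumerate(set(lst_labels))}
int_labels = [class2int[l] for l in lst_labels]

# human-readable lookup table, mirroring classname2integer.txt
with open("classname2integer.txt", "w") as text_file:
    text_file.write('\n'.join(f'{class2int[name]}\t\t{name}' for name in class2int))

print(int_labels)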