Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
N
NLP in diagnostic texts from nephropathology
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Maximilian Legnar
NLP in diagnostic texts from nephropathology
Commits
8ed471a1
Commit
8ed471a1
authored
Sep 26, 2022
by
max
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
extended search phrases for split_reports.py.
split_reports.py works now also with labeled datasets.
parent
94aab3d3
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
52 additions
and
18 deletions
+52
-18
reportPreparationTools.py
database_preparation/reportPreparationTools.py
+3
-4
split_reports.py
database_preparation/split_reports.py
+49
-14
No files found.
database_preparation/reportPreparationTools.py
View file @
8ed471a1
...
@@ -19,13 +19,14 @@ def get3parts(t_file):
...
@@ -19,13 +19,14 @@ def get3parts(t_file):
#%% get start codons for description and diagnosis
#%% get start codons for description and diagnosis
# thinking that one pathologist sticks to his/her wording
# thinking that one pathologist sticks to his/her wording
start_codon_description
=
find_codon
(
t_text
,
start_codon_description
=
find_codon
(
t_text
,
[
'Lichtmikroskopie:'
,
'Mikroskopie:'
,
"Histologie:
"
])
[
'Lichtmikroskopie:'
,
'Mikroskopie:'
,
"Histologie:"
,
"Klinische Angaben:"
,
"Wir erhielten
"
])
start_codon_2nd
=
find_codon
(
t_text
,
start_codon_2nd
=
find_codon
(
t_text
,
[
"Nachbericht"
,
"Immunhistochemie"
])
[
"Nachbericht"
,
"Immunhistochemie"
])
start_codon_conclusion
=
find_codon
(
t_text
,
start_codon_conclusion
=
find_codon
(
t_text
,
[
"Beurteilung:"
,
"Begutachtung:"
])
[
"Beurteilung:"
,
"Begutachtung:"
])
# Vorläufige Beurteilung gemäß der Gefrierschnittführung: .... Beurteilung am Paraffinmaterial:
start_codon_comment
=
find_codon
(
t_text
,
start_codon_comment
=
find_codon
(
t_text
,
[
"Kommentar"
])
[
"Kommentar"
])
...
@@ -94,10 +95,8 @@ def get3parts(t_file):
...
@@ -94,10 +95,8 @@ def get3parts(t_file):
txt_micro
=
str
(
txt_micro
[
0
])
txt_micro
=
str
(
txt_micro
[
0
])
# get greetings-section:
# get greetings-section:
start_greedingsindex
=
t_text
.
find
(
"Mit freundlichen"
)
start_greedingsindex
=
t_text
.
find
(
start_codon_greetings
)
txt_greetings
=
None
txt_greetings
=
None
if
start_greedingsindex
==
-
1
:
start_greedingsindex
=
t_text
.
find
(
"Prof."
)
if
start_greedingsindex
!=
-
1
:
if
start_greedingsindex
!=
-
1
:
txt_greetings
=
t_text
[
start_greedingsindex
:]
txt_greetings
=
t_text
[
start_greedingsindex
:]
...
...
database_preparation/split_reports.py
View file @
8ed471a1
...
@@ -2,13 +2,17 @@
...
@@ -2,13 +2,17 @@
'''
'''
this script splits the reports (.txt files) located in the
this script splits the reports (.txt files) located in the
--path_to_reports folder into description-, diagnosis- and end-section
--path_to_reports folder into description-, diagnosis- and end-section
and saves these parts in --target_folder_path
(as -txt files)
.
and saves these parts in --target_folder_path.
In addition, a pandas dataframe (--df_cases_file) is then generated in
In addition, a pandas dataframe (--df_cases_file) is then generated in
which it is recorded which 3 report sections belong together. The df_cases
which it is recorded which 3 report sections belong together. The df_cases
dataframe is also there to label the reports with different labelsets.
dataframe is also there to label the reports with different labelsets.
Also pass --author_names (space separated names, cased) to label the reports by found
Also pass --author_names (space separated names, cased) to label the reports by found
authors (stored in df_cases)
authors (stored in df_cases)
The passed reports can also be labeled.
The label of each report is expected to be in the title/name of the .txt file,
located between the start symbol "--label_in_filename_symbol" and the .txt extension
'''
'''
import
sys
import
sys
...
@@ -35,20 +39,30 @@ parser.add_argument("--author_names",
...
@@ -35,20 +39,30 @@ parser.add_argument("--author_names",
parser
.
add_argument
(
"--text_encoding"
,
parser
.
add_argument
(
"--text_encoding"
,
default
=
"latin-1"
)
default
=
"latin-1"
)
parser
.
add_argument
(
"--use_newest_reports"
,
action
=
'store_true'
)
parser
.
add_argument
(
"--use_newest_reports"
,
action
=
'store_true'
)
parser
.
add_argument
(
"--label_in_filename_symbol"
,
default
=
''
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
# %% get all files
# %% get all files
# get the primary reports, whose histology numbers end with .0:
# get the primary reports, whose histology numbers end with .0:
report_file_list
=
glob
.
glob
(
args
.
path_to_reports
+
'/*0.txt'
)
report_file_list
=
glob
.
glob
(
args
.
path_to_reports
+
'/*0.txt'
)
# get all last reports (newest ones)
if
len
(
report_file_list
)
<=
0
:
if
args
.
use_newest_reports
:
report_file_list
=
glob
.
glob
(
args
.
path_to_reports
+
'/*.txt'
)
print
(
"using newest reports (if there is a H.123.0 and a H.123.2, take H.123.2)"
)
if
len
(
report_file_list
)
==
0
:
for
idx
in
tqdm
(
range
(
0
,
len
(
report_file_list
))):
print
(
f
"ERROR: Cant find .txt files in {args.path_to_reports}."
)
t_file_name
=
report_file_list
[
idx
]
exit
(
1
)
report_file_list
[
idx
]
=
glob
.
glob
(
t_file_name
[
0
:
-
5
]
+
"*.txt"
)[
-
1
]
print
(
f
'Using all reports, located in {args.path_to_reports}'
)
else
:
else
:
print
(
"using oldes reports (if there is a H.123.0 and a H.123.2, take H.123.0)"
)
# get all last reports (newest ones)
if
args
.
use_newest_reports
:
print
(
"using newest reports (if there is a H.123.0 and a H.123.2, take H.123.2)"
)
for
idx
in
tqdm
(
range
(
0
,
len
(
report_file_list
))):
t_file_name
=
report_file_list
[
idx
]
report_file_list
[
idx
]
=
glob
.
glob
(
t_file_name
[
0
:
-
5
]
+
"*.txt"
)[
-
1
]
else
:
print
(
"using oldes reports (if there is a H.123.0 and a H.123.2, take H.123.0)"
)
...
@@ -60,14 +74,14 @@ def make_folder(dir):
...
@@ -60,14 +74,14 @@ def make_folder(dir):
make_folder
(
args
.
target_folder_path
+
"/description"
)
make_folder
(
args
.
target_folder_path
+
"/description"
)
make_folder
(
args
.
target_folder_path
+
"/diagnosis"
)
make_folder
(
args
.
target_folder_path
+
"/diagnosis"
)
make_folder
(
args
.
target_folder_path
+
"/end"
)
make_folder
(
args
.
target_folder_path
+
"/end"
)
make_folder
(
args
.
target_folder_path
+
"/
short_diagnosis
"
)
make_folder
(
args
.
target_folder_path
+
"/
label
"
)
print
(
f
"Splitting reports of corpus {args.path_to_reports} into description-, diagnosis- and end-sections..."
)
print
(
f
"Splitting reports of corpus {args.path_to_reports} into description-, diagnosis- and end-sections..."
)
# %% iterate over all files
# %% iterate over all files
error_file_list
=
[]
error_file_list
=
[]
no_error_file_list
=
[]
no_error_file_list
=
[]
lst_description
,
lst_diagnose
,
lst_end
=
[],
[],
[]
lst_description
,
lst_diagnose
,
lst_end
,
lst_labels
=
[],
[],
[],
[]
for
idx
,
t_file
in
tqdm
(
enumerate
(
report_file_list
)):
for
idx
,
t_file
in
tqdm
(
enumerate
(
report_file_list
)):
try
:
try
:
...
@@ -93,10 +107,25 @@ for idx, t_file in tqdm(enumerate(report_file_list)):
...
@@ -93,10 +107,25 @@ for idx, t_file in tqdm(enumerate(report_file_list)):
end
=
"None"
end
=
"None"
text_file
.
write
(
end
)
text_file
.
write
(
end
)
if
args
.
label_in_filename_symbol
!=
''
:
try
:
a
=
t_file
.
index
(
'#'
)
+
1
b
=
t_file
.
index
(
'.txt'
)
label
=
t_file
[
a
:
b
]
except
:
label
=
'None'
with
open
(
args
.
target_folder_path
+
"/label/label#"
+
str
(
idx
)
+
".txt"
,
"w"
,
encoding
=
args
.
text_encoding
)
as
text_file
:
text_file
.
write
(
label
)
lst_labels
.
append
(
'label#'
+
str
(
idx
)
+
".txt"
)
lst_description
.
append
(
'description#'
+
str
(
idx
)
+
".txt"
)
lst_description
.
append
(
'description#'
+
str
(
idx
)
+
".txt"
)
lst_diagnose
.
append
(
'diagnosis#'
+
str
(
idx
)
+
".txt"
)
lst_diagnose
.
append
(
'diagnosis#'
+
str
(
idx
)
+
".txt"
)
lst_end
.
append
(
'end#'
+
str
(
idx
)
+
".txt"
)
lst_end
.
append
(
'end#'
+
str
(
idx
)
+
".txt"
)
#save skipped reports:
#save skipped reports:
with
open
(
args
.
target_folder_path
+
"/failed_to_split_list.txt"
,
"w"
)
as
text_file
:
with
open
(
args
.
target_folder_path
+
"/failed_to_split_list.txt"
,
"w"
)
as
text_file
:
text_file
.
write
(
'
\n
'
.
join
(
error_file_list
))
text_file
.
write
(
'
\n
'
.
join
(
error_file_list
))
...
@@ -108,13 +137,17 @@ print(f"skipped {len(error_file_list)} reports "
...
@@ -108,13 +137,17 @@ print(f"skipped {len(error_file_list)} reports "
f
"failed (see {args.target_folder_path + '/failed_to_split_list.txt'})."
)
f
"failed (see {args.target_folder_path + '/failed_to_split_list.txt'})."
)
processed_docs
=
len
(
lst_description
)
processed_docs
=
len
(
lst_description
)
print
(
f
"saved {processed_docs} description sections at {args.target_folder_path + '/description'}"
)
#
print(f"saved {processed_docs} description sections at {args.target_folder_path + '/description'}")
print
(
f
"saved {processed_docs} diagnosis sections at {args.target_folder_path + '/diagnosis'}"
)
#
print(f"saved {processed_docs} diagnosis sections at {args.target_folder_path + '/diagnosis'}")
print
(
f
"saved {processed_docs} end sections at {args.target_folder_path + '/end'}"
)
#
print(f"saved {processed_docs} end sections at {args.target_folder_path + '/end'}")
# create and save df_cases file, which is also used to save the labels (=cluster-indices) for each text.
# create and save df_cases file, which is also used to save the labels (=cluster-indices) for each text.
df
=
pd
.
DataFrame
(
list
(
zip
(
lst_description
,
lst_diagnose
,
lst_end
)),
if
args
.
label_in_filename_symbol
==
''
:
df
=
pd
.
DataFrame
(
list
(
zip
(
lst_description
,
lst_diagnose
,
lst_end
)),
columns
=
[
'description_text_files'
,
'diagnosis_text_files'
,
'end_text_files'
])
columns
=
[
'description_text_files'
,
'diagnosis_text_files'
,
'end_text_files'
])
else
:
df
=
pd
.
DataFrame
(
list
(
zip
(
lst_description
,
lst_diagnose
,
lst_end
,
lst_labels
)),
columns
=
[
'description_text_files'
,
'diagnosis_text_files'
,
'end_text_files'
,
'label_files'
])
df
.
to_pickle
(
args
.
df_cases_file
)
df
.
to_pickle
(
args
.
df_cases_file
)
# search for authors in end-sections in order to add them as labels to the df_cases file:
# search for authors in end-sections in order to add them as labels to the df_cases file:
...
@@ -125,4 +158,6 @@ except:
...
@@ -125,4 +158,6 @@ except:
df
=
pd
.
read_pickle
(
args
.
df_cases_file
)
df
=
pd
.
read_pickle
(
args
.
df_cases_file
)
print
(
f
"saved df_cases at {args.df_cases_file}
\n
"
)
print
(
f
"saved df_cases at {args.df_cases_file}
\n
"
)
print
(
f
'it looks like this:'
)
print
(
df
)
print
()
print
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment