Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
N
NLP in diagnostic texts from nephropathology
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Maximilian Legnar
NLP in diagnostic texts from nephropathology
Commits
8ed471a1
Commit
8ed471a1
authored
2 years ago
by
max
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
extended search phrases for split_reports.py.
split_reports.py works now also with labeled datasets.
parent
94aab3d3
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
52 additions
and
18 deletions
+52
-18
reportPreparationTools.py
database_preparation/reportPreparationTools.py
+3
-4
split_reports.py
database_preparation/split_reports.py
+49
-14
No files found.
database_preparation/reportPreparationTools.py
View file @
8ed471a1
...
@@ -19,13 +19,14 @@ def get3parts(t_file):
...
@@ -19,13 +19,14 @@ def get3parts(t_file):
#%% get start codons for description and diagnosis
#%% get start codons for description and diagnosis
# thinking that one pathologist sticks to his/her wording
# thinking that one pathologist sticks to his/her wording
start_codon_description
=
find_codon
(
t_text
,
start_codon_description
=
find_codon
(
t_text
,
[
'Lichtmikroskopie:'
,
'Mikroskopie:'
,
"Histologie:
"
])
[
'Lichtmikroskopie:'
,
'Mikroskopie:'
,
"Histologie:"
,
"Klinische Angaben:"
,
"Wir erhielten
"
])
start_codon_2nd
=
find_codon
(
t_text
,
start_codon_2nd
=
find_codon
(
t_text
,
[
"Nachbericht"
,
"Immunhistochemie"
])
[
"Nachbericht"
,
"Immunhistochemie"
])
start_codon_conclusion
=
find_codon
(
t_text
,
start_codon_conclusion
=
find_codon
(
t_text
,
[
"Beurteilung:"
,
"Begutachtung:"
])
[
"Beurteilung:"
,
"Begutachtung:"
])
# Vorläufige Beurteilung gemäß der Gefrierschnittführung: .... Beurteilung am Paraffinmaterial:
start_codon_comment
=
find_codon
(
t_text
,
start_codon_comment
=
find_codon
(
t_text
,
[
"Kommentar"
])
[
"Kommentar"
])
...
@@ -94,10 +95,8 @@ def get3parts(t_file):
...
@@ -94,10 +95,8 @@ def get3parts(t_file):
txt_micro
=
str
(
txt_micro
[
0
])
txt_micro
=
str
(
txt_micro
[
0
])
# get greetings-section:
# get greetings-section:
start_greedingsindex
=
t_text
.
find
(
"Mit freundlichen"
)
start_greedingsindex
=
t_text
.
find
(
start_codon_greetings
)
txt_greetings
=
None
txt_greetings
=
None
if
start_greedingsindex
==
-
1
:
start_greedingsindex
=
t_text
.
find
(
"Prof."
)
if
start_greedingsindex
!=
-
1
:
if
start_greedingsindex
!=
-
1
:
txt_greetings
=
t_text
[
start_greedingsindex
:]
txt_greetings
=
t_text
[
start_greedingsindex
:]
...
...
This diff is collapsed.
Click to expand it.
database_preparation/split_reports.py
View file @
8ed471a1
...
@@ -2,13 +2,17 @@
...
@@ -2,13 +2,17 @@
'''
'''
this script splits the reports (.txt files) located in the
this script splits the reports (.txt files) located in the
--path_to_reports folder into description-, diagnosis- and end-section
--path_to_reports folder into description-, diagnosis- and end-section
and saves these parts in --target_folder_path
(as -txt files)
.
and saves these parts in --target_folder_path.
In addition, a pandas dataframe (--df_cases_file) is then generated in
In addition, a pandas dataframe (--df_cases_file) is then generated in
which it is saved which 3 report-sections belong together. The df_cases
which it is saved which 3 report-sections belong together. The df_cases
dataframe is also there to label the reports with different labelsets.
dataframe is also there to label the reports with different labelsets.
Also pass --author_names (space separated names, cased) to label the reports by found
Also pass --author_names (space separated names, cased) to label the reports by found
authors (stored in df_cases)
authors (stored in df_cases)
The passed reports can also be labeled.
The label of each report is expected to be in the title/name of the .txt file,
located after the start symbol "--label_in_filename_symbol" until .txt
'''
'''
import
sys
import
sys
...
@@ -35,19 +39,29 @@ parser.add_argument("--author_names",
...
@@ -35,19 +39,29 @@ parser.add_argument("--author_names",
parser
.
add_argument
(
"--text_encoding"
,
parser
.
add_argument
(
"--text_encoding"
,
default
=
"latin-1"
)
default
=
"latin-1"
)
parser
.
add_argument
(
"--use_newest_reports"
,
action
=
'store_true'
)
parser
.
add_argument
(
"--use_newest_reports"
,
action
=
'store_true'
)
parser
.
add_argument
(
"--label_in_filename_symbol"
,
default
=
''
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
# %% get all files
# %% get all files
# get the primary reports, which histonums are ending wit .0:
# get the primary reports, which histonums are ending wit .0:
report_file_list
=
glob
.
glob
(
args
.
path_to_reports
+
'/*0.txt'
)
report_file_list
=
glob
.
glob
(
args
.
path_to_reports
+
'/*0.txt'
)
# get all last reports (newest ones)
if
len
(
report_file_list
)
<=
0
:
if
args
.
use_newest_reports
:
report_file_list
=
glob
.
glob
(
args
.
path_to_reports
+
'/*.txt'
)
if
len
(
report_file_list
)
==
0
:
print
(
f
"ERROR: Cant find .txt files in {args.path_to_reports}."
)
exit
(
1
)
print
(
f
'Using all reports, located in {args.path_to_reports}'
)
else
:
# get all last reports (newest ones)
if
args
.
use_newest_reports
:
print
(
"using newest reports (if there is a H.123.0 and a H.123.2, take H.123.2)"
)
print
(
"using newest reports (if there is a H.123.0 and a H.123.2, take H.123.2)"
)
for
idx
in
tqdm
(
range
(
0
,
len
(
report_file_list
))):
for
idx
in
tqdm
(
range
(
0
,
len
(
report_file_list
))):
t_file_name
=
report_file_list
[
idx
]
t_file_name
=
report_file_list
[
idx
]
report_file_list
[
idx
]
=
glob
.
glob
(
t_file_name
[
0
:
-
5
]
+
"*.txt"
)[
-
1
]
report_file_list
[
idx
]
=
glob
.
glob
(
t_file_name
[
0
:
-
5
]
+
"*.txt"
)[
-
1
]
else
:
else
:
print
(
"using oldes reports (if there is a H.123.0 and a H.123.2, take H.123.0)"
)
print
(
"using oldes reports (if there is a H.123.0 and a H.123.2, take H.123.0)"
)
...
@@ -60,14 +74,14 @@ def make_folder(dir):
...
@@ -60,14 +74,14 @@ def make_folder(dir):
make_folder
(
args
.
target_folder_path
+
"/description"
)
make_folder
(
args
.
target_folder_path
+
"/description"
)
make_folder
(
args
.
target_folder_path
+
"/diagnosis"
)
make_folder
(
args
.
target_folder_path
+
"/diagnosis"
)
make_folder
(
args
.
target_folder_path
+
"/end"
)
make_folder
(
args
.
target_folder_path
+
"/end"
)
make_folder
(
args
.
target_folder_path
+
"/
short_diagnosis
"
)
make_folder
(
args
.
target_folder_path
+
"/
label
"
)
print
(
f
"Splitting reports of corpus {args.path_to_reports} into description-, diagnosis- and end-sections..."
)
print
(
f
"Splitting reports of corpus {args.path_to_reports} into description-, diagnosis- and end-sections..."
)
# %% iterate over all files
# %% iterate over all files
error_file_list
=
[]
error_file_list
=
[]
no_error_file_list
=
[]
no_error_file_list
=
[]
lst_description
,
lst_diagnose
,
lst_end
=
[],
[],
[]
lst_description
,
lst_diagnose
,
lst_end
,
lst_labels
=
[],
[],
[],
[]
for
idx
,
t_file
in
tqdm
(
enumerate
(
report_file_list
)):
for
idx
,
t_file
in
tqdm
(
enumerate
(
report_file_list
)):
try
:
try
:
...
@@ -93,10 +107,25 @@ for idx, t_file in tqdm(enumerate(report_file_list)):
...
@@ -93,10 +107,25 @@ for idx, t_file in tqdm(enumerate(report_file_list)):
end
=
"None"
end
=
"None"
text_file
.
write
(
end
)
text_file
.
write
(
end
)
if
args
.
label_in_filename_symbol
!=
''
:
try
:
a
=
t_file
.
index
(
'#'
)
+
1
b
=
t_file
.
index
(
'.txt'
)
label
=
t_file
[
a
:
b
]
except
:
label
=
'None'
with
open
(
args
.
target_folder_path
+
"/label/label#"
+
str
(
idx
)
+
".txt"
,
"w"
,
encoding
=
args
.
text_encoding
)
as
text_file
:
text_file
.
write
(
label
)
lst_labels
.
append
(
'label#'
+
str
(
idx
)
+
".txt"
)
lst_description
.
append
(
'description#'
+
str
(
idx
)
+
".txt"
)
lst_description
.
append
(
'description#'
+
str
(
idx
)
+
".txt"
)
lst_diagnose
.
append
(
'diagnosis#'
+
str
(
idx
)
+
".txt"
)
lst_diagnose
.
append
(
'diagnosis#'
+
str
(
idx
)
+
".txt"
)
lst_end
.
append
(
'end#'
+
str
(
idx
)
+
".txt"
)
lst_end
.
append
(
'end#'
+
str
(
idx
)
+
".txt"
)
#save skipped reports:
#save skipped reports:
with
open
(
args
.
target_folder_path
+
"/failed_to_split_list.txt"
,
"w"
)
as
text_file
:
with
open
(
args
.
target_folder_path
+
"/failed_to_split_list.txt"
,
"w"
)
as
text_file
:
text_file
.
write
(
'
\n
'
.
join
(
error_file_list
))
text_file
.
write
(
'
\n
'
.
join
(
error_file_list
))
...
@@ -108,13 +137,17 @@ print(f"skipped {len(error_file_list)} reports "
...
@@ -108,13 +137,17 @@ print(f"skipped {len(error_file_list)} reports "
f
"failed (see {args.target_folder_path + '/failed_to_split_list.txt'})."
)
f
"failed (see {args.target_folder_path + '/failed_to_split_list.txt'})."
)
processed_docs
=
len
(
lst_description
)
processed_docs
=
len
(
lst_description
)
print
(
f
"saved {processed_docs} description sections at {args.target_folder_path + '/description'}"
)
#
print(f"saved {processed_docs} description sections at {args.target_folder_path + '/description'}")
print
(
f
"saved {processed_docs} diagnosis sections at {args.target_folder_path + '/diagnosis'}"
)
#
print(f"saved {processed_docs} diagnosis sections at {args.target_folder_path + '/diagnosis'}")
print
(
f
"saved {processed_docs} end sections at {args.target_folder_path + '/end'}"
)
#
print(f"saved {processed_docs} end sections at {args.target_folder_path + '/end'}")
# create and save df_cases file, which is also used to save the labels (=cluster-indices) for each text.
# create and save df_cases file, which is also used to save the labels (=cluster-indices) for each text.
df
=
pd
.
DataFrame
(
list
(
zip
(
lst_description
,
lst_diagnose
,
lst_end
)),
if
args
.
label_in_filename_symbol
==
''
:
df
=
pd
.
DataFrame
(
list
(
zip
(
lst_description
,
lst_diagnose
,
lst_end
)),
columns
=
[
'description_text_files'
,
'diagnosis_text_files'
,
'end_text_files'
])
columns
=
[
'description_text_files'
,
'diagnosis_text_files'
,
'end_text_files'
])
else
:
df
=
pd
.
DataFrame
(
list
(
zip
(
lst_description
,
lst_diagnose
,
lst_end
,
lst_labels
)),
columns
=
[
'description_text_files'
,
'diagnosis_text_files'
,
'end_text_files'
,
'label_files'
])
df
.
to_pickle
(
args
.
df_cases_file
)
df
.
to_pickle
(
args
.
df_cases_file
)
# search for authors in end-sections in order to add them as labels to the df_cases file:
# search for authors in end-sections in order to add them as labels to the df_cases file:
...
@@ -125,4 +158,6 @@ except:
...
@@ -125,4 +158,6 @@ except:
df
=
pd
.
read_pickle
(
args
.
df_cases_file
)
df
=
pd
.
read_pickle
(
args
.
df_cases_file
)
print
(
f
"saved df_cases at {args.df_cases_file}
\n
"
)
print
(
f
"saved df_cases at {args.df_cases_file}
\n
"
)
print
(
f
'it looks like this:'
)
print
(
df
)
print
()
print
()
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment