Commit 8ed471a1 authored by max's avatar max

extended search phrases for split_reports.py.

split_reports.py works now also with labeled datasets.
parent 94aab3d3
......@@ -19,13 +19,14 @@ def get3parts(t_file):
#%% get start codons for description and diagnosis
# thinking that one pathologist sticks to his/her wording
start_codon_description = find_codon(t_text,
['Lichtmikroskopie:', 'Mikroskopie:',"Histologie:"])
['Lichtmikroskopie:', 'Mikroskopie:', "Histologie:", "Klinische Angaben:", "Wir erhielten"])
start_codon_2nd = find_codon(t_text,
["Nachbericht", "Immunhistochemie"])
start_codon_conclusion = find_codon(t_text,
["Beurteilung:", "Begutachtung:"])
# Vorläufige Beurteilung gemäß der Gefrierschnittführung: .... Beurteilung am Paraffinmaterial:
start_codon_comment = find_codon(t_text,
["Kommentar"])
......@@ -94,10 +95,8 @@ def get3parts(t_file):
txt_micro = str(txt_micro[0])
# get greetings-section:
start_greedingsindex = t_text.find("Mit freundlichen")
start_greedingsindex = t_text.find(start_codon_greetings)
txt_greetings = None
if start_greedingsindex == -1:
start_greedingsindex = t_text.find("Prof.")
if start_greedingsindex != -1:
txt_greetings = t_text[start_greedingsindex:]
......
......@@ -2,13 +2,17 @@
'''
this script splits the reports (.txt files) located in the
--path_to_reports folder into description-, diagnosis- and end-section
and saves these parts in --target_folder_path (as -txt files).
and saves these parts in --target_folder_path.
In addition, a pandas dataframe (--df_cases_file) is then generated in
which it is saved which 3 report-sections belong together. The df_cases
dataframe is also there to label the reports with different labelsets.
Also pass --author_names (space-separated names, cased) to label the reports by the
authors found in them (stored in df_cases)
The passed reports can also be labeled.
The label of each report is expected to be in the title/name of the .txt file,
located after the start symbol "--label_in_filename_symbol" and before the ".txt" extension
'''
import sys
......@@ -35,20 +39,30 @@ parser.add_argument("--author_names",
parser.add_argument("--text_encoding",
default="latin-1")
parser.add_argument("--use_newest_reports", action='store_true')
parser.add_argument("--label_in_filename_symbol",
default='')
args = parser.parse_args()
# %% get all files
# get the primary reports, whose histonums end with .0:
report_file_list = glob.glob(args.path_to_reports + '/*0.txt')
# get all last reports (newest ones)
if args.use_newest_reports:
print("using newest reports (if there is a H.123.0 and a H.123.2, take H.123.2)")
for idx in tqdm(range(0, len(report_file_list))):
t_file_name = report_file_list[idx]
report_file_list[idx] = glob.glob(t_file_name[0:-5] + "*.txt")[-1]
if len(report_file_list) <= 0:
report_file_list = glob.glob(args.path_to_reports + '/*.txt')
if len(report_file_list) == 0:
print(f"ERROR: Cant find .txt files in {args.path_to_reports}.")
exit(1)
print(f'Using all reports, located in {args.path_to_reports}')
else:
print("using oldes reports (if there is a H.123.0 and a H.123.2, take H.123.0)")
# get all last reports (newest ones)
if args.use_newest_reports:
print("using newest reports (if there is a H.123.0 and a H.123.2, take H.123.2)")
for idx in tqdm(range(0, len(report_file_list))):
t_file_name = report_file_list[idx]
report_file_list[idx] = glob.glob(t_file_name[0:-5] + "*.txt")[-1]
else:
print("using oldes reports (if there is a H.123.0 and a H.123.2, take H.123.0)")
......@@ -60,14 +74,14 @@ def make_folder(dir):
make_folder(args.target_folder_path + "/description")
make_folder(args.target_folder_path + "/diagnosis")
make_folder(args.target_folder_path + "/end")
make_folder(args.target_folder_path + "/short_diagnosis")
make_folder(args.target_folder_path + "/label")
print(f"Splitting reports of corpus {args.path_to_reports} into description-, diagnosis- and end-sections...")
# %% iterate over all files
error_file_list = []
no_error_file_list = []
lst_description, lst_diagnose, lst_end = [], [], []
lst_description, lst_diagnose, lst_end, lst_labels = [], [], [], []
for idx, t_file in tqdm(enumerate(report_file_list)):
try:
......@@ -93,10 +107,25 @@ for idx, t_file in tqdm(enumerate(report_file_list)):
end = "None"
text_file.write(end)
if args.label_in_filename_symbol != '':
try:
a = t_file.index('#') + 1
b = t_file.index('.txt')
label = t_file[a:b]
except:
label = 'None'
with open(args.target_folder_path + "/label/label#" + str(idx) + ".txt", "w",
encoding=args.text_encoding) as text_file:
text_file.write(label)
lst_labels.append('label#' + str(idx) + ".txt")
lst_description.append('description#' + str(idx) + ".txt")
lst_diagnose.append('diagnosis#' + str(idx) + ".txt")
lst_end.append('end#' + str(idx) + ".txt")
#save skipped reports:
with open(args.target_folder_path + "/failed_to_split_list.txt", "w") as text_file:
text_file.write('\n'.join(error_file_list))
......@@ -108,13 +137,17 @@ print(f"skipped {len(error_file_list)} reports "
f"failed (see {args.target_folder_path + '/failed_to_split_list.txt'}).")
processed_docs = len(lst_description)
print(f"saved {processed_docs} description sections at {args.target_folder_path + '/description'}")
print(f"saved {processed_docs} diagnosis sections at {args.target_folder_path + '/diagnosis'}")
print(f"saved {processed_docs} end sections at {args.target_folder_path + '/end'}")
#print(f"saved {processed_docs} description sections at {args.target_folder_path + '/description'}")
#print(f"saved {processed_docs} diagnosis sections at {args.target_folder_path + '/diagnosis'}")
#print(f"saved {processed_docs} end sections at {args.target_folder_path + '/end'}")
# create and save df_cases file, which is also used to save the labels (=cluster-indices) for each text.
df = pd.DataFrame(list(zip(lst_description, lst_diagnose, lst_end)),
if args.label_in_filename_symbol == '':
df = pd.DataFrame(list(zip(lst_description, lst_diagnose, lst_end)),
columns=['description_text_files', 'diagnosis_text_files', 'end_text_files'])
else:
df = pd.DataFrame(list(zip(lst_description, lst_diagnose, lst_end, lst_labels)),
columns=['description_text_files', 'diagnosis_text_files', 'end_text_files', 'label_files'])
df.to_pickle(args.df_cases_file)
# search for authors in end-sections in order to add them as labels to the df_cases file:
......@@ -125,4 +158,6 @@ except:
df = pd.read_pickle(args.df_cases_file)
print(f"saved df_cases at {args.df_cases_file}\n")
print(f'it looks like this:')
print(df)
print()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment