Commit 8ed471a1 authored by max's avatar max

extended search phrases for split_reports.py.

split_reports.py works now also with labeled datasets.
parent 94aab3d3
...@@ -19,13 +19,14 @@ def get3parts(t_file): ...@@ -19,13 +19,14 @@ def get3parts(t_file):
#%% get start codons for description and diagnosis #%% get start codons for description and diagnosis
# thinking that one pathologist sticks to his/her wording # thinking that one pathologist sticks to his/her wording
start_codon_description = find_codon(t_text, start_codon_description = find_codon(t_text,
['Lichtmikroskopie:', 'Mikroskopie:',"Histologie:"]) ['Lichtmikroskopie:', 'Mikroskopie:', "Histologie:", "Klinische Angaben:", "Wir erhielten"])
start_codon_2nd = find_codon(t_text, start_codon_2nd = find_codon(t_text,
["Nachbericht", "Immunhistochemie"]) ["Nachbericht", "Immunhistochemie"])
start_codon_conclusion = find_codon(t_text, start_codon_conclusion = find_codon(t_text,
["Beurteilung:", "Begutachtung:"]) ["Beurteilung:", "Begutachtung:"])
# Vorläufige Beurteilung gemäß der Gefrierschnittführung: .... Beurteilung am Paraffinmaterial:
start_codon_comment = find_codon(t_text, start_codon_comment = find_codon(t_text,
["Kommentar"]) ["Kommentar"])
...@@ -94,10 +95,8 @@ def get3parts(t_file): ...@@ -94,10 +95,8 @@ def get3parts(t_file):
txt_micro = str(txt_micro[0]) txt_micro = str(txt_micro[0])
# get greetings-section: # get greetings-section:
start_greedingsindex = t_text.find("Mit freundlichen") start_greedingsindex = t_text.find(start_codon_greetings)
txt_greetings = None txt_greetings = None
if start_greedingsindex == -1:
start_greedingsindex = t_text.find("Prof.")
if start_greedingsindex != -1: if start_greedingsindex != -1:
txt_greetings = t_text[start_greedingsindex:] txt_greetings = t_text[start_greedingsindex:]
......
...@@ -2,13 +2,17 @@ ...@@ -2,13 +2,17 @@
''' '''
this script splits the reports (.txt files) located in the this script splits the reports (.txt files) located in the
--path_to_reports folder into description-, diagnosis- and end-section --path_to_reports folder into description-, diagnosis- and end-section
and saves these parts in --target_folder_path (as -txt files). and saves these parts in --target_folder_path.
In addition, a pandas dataframe (--df_cases_file) is then generated in In addition, a pandas dataframe (--df_cases_file) is then generated in
which it is recorded which 3 report-sections belong together. The df_cases which it is recorded which 3 report-sections belong together. The df_cases
dataframe is also there to label the reports with different labelsets. dataframe is also there to label the reports with different labelsets.
Also pass --author_names (space separated names, cased) to label the reports by found Also pass --author_names (space separated names, cased) to label the reports by found
authors (stored in df_cases) authors (stored in df_cases)
The passed reports can also be labeled.
The label of each report is expected to be in the title/name of the .txt file,
located after the start symbol "--label_in_filename_symbol" until .txt
''' '''
import sys import sys
...@@ -35,20 +39,30 @@ parser.add_argument("--author_names", ...@@ -35,20 +39,30 @@ parser.add_argument("--author_names",
parser.add_argument("--text_encoding", parser.add_argument("--text_encoding",
default="latin-1") default="latin-1")
parser.add_argument("--use_newest_reports", action='store_true') parser.add_argument("--use_newest_reports", action='store_true')
parser.add_argument("--label_in_filename_symbol",
default='')
args = parser.parse_args() args = parser.parse_args()
# %% get all files # %% get all files
# get the primary reports, whose histonums are ending with .0: # get the primary reports, whose histonums are ending with .0:
report_file_list = glob.glob(args.path_to_reports + '/*0.txt') report_file_list = glob.glob(args.path_to_reports + '/*0.txt')
# get all last reports (newest ones) if len(report_file_list) <= 0:
if args.use_newest_reports: report_file_list = glob.glob(args.path_to_reports + '/*.txt')
print("using newest reports (if there is a H.123.0 and a H.123.2, take H.123.2)") if len(report_file_list) == 0:
for idx in tqdm(range(0, len(report_file_list))): print(f"ERROR: Cant find .txt files in {args.path_to_reports}.")
t_file_name = report_file_list[idx] exit(1)
report_file_list[idx] = glob.glob(t_file_name[0:-5] + "*.txt")[-1] print(f'Using all reports, located in {args.path_to_reports}')
else: else:
print("using oldes reports (if there is a H.123.0 and a H.123.2, take H.123.0)") # get all last reports (newest ones)
if args.use_newest_reports:
print("using newest reports (if there is a H.123.0 and a H.123.2, take H.123.2)")
for idx in tqdm(range(0, len(report_file_list))):
t_file_name = report_file_list[idx]
report_file_list[idx] = glob.glob(t_file_name[0:-5] + "*.txt")[-1]
else:
print("using oldes reports (if there is a H.123.0 and a H.123.2, take H.123.0)")
...@@ -60,14 +74,14 @@ def make_folder(dir): ...@@ -60,14 +74,14 @@ def make_folder(dir):
make_folder(args.target_folder_path + "/description") make_folder(args.target_folder_path + "/description")
make_folder(args.target_folder_path + "/diagnosis") make_folder(args.target_folder_path + "/diagnosis")
make_folder(args.target_folder_path + "/end") make_folder(args.target_folder_path + "/end")
make_folder(args.target_folder_path + "/short_diagnosis") make_folder(args.target_folder_path + "/label")
print(f"Splitting reports of corpus {args.path_to_reports} into description-, diagnosis- and end-sections...") print(f"Splitting reports of corpus {args.path_to_reports} into description-, diagnosis- and end-sections...")
# %% iterate over all files # %% iterate over all files
error_file_list = [] error_file_list = []
no_error_file_list = [] no_error_file_list = []
lst_description, lst_diagnose, lst_end = [], [], [] lst_description, lst_diagnose, lst_end, lst_labels = [], [], [], []
for idx, t_file in tqdm(enumerate(report_file_list)): for idx, t_file in tqdm(enumerate(report_file_list)):
try: try:
...@@ -93,10 +107,25 @@ for idx, t_file in tqdm(enumerate(report_file_list)): ...@@ -93,10 +107,25 @@ for idx, t_file in tqdm(enumerate(report_file_list)):
end = "None" end = "None"
text_file.write(end) text_file.write(end)
if args.label_in_filename_symbol != '':
try:
a = t_file.index('#') + 1
b = t_file.index('.txt')
label = t_file[a:b]
except:
label = 'None'
with open(args.target_folder_path + "/label/label#" + str(idx) + ".txt", "w",
encoding=args.text_encoding) as text_file:
text_file.write(label)
lst_labels.append('label#' + str(idx) + ".txt")
lst_description.append('description#' + str(idx) + ".txt") lst_description.append('description#' + str(idx) + ".txt")
lst_diagnose.append('diagnosis#' + str(idx) + ".txt") lst_diagnose.append('diagnosis#' + str(idx) + ".txt")
lst_end.append('end#' + str(idx) + ".txt") lst_end.append('end#' + str(idx) + ".txt")
#save skipped reports: #save skipped reports:
with open(args.target_folder_path + "/failed_to_split_list.txt", "w") as text_file: with open(args.target_folder_path + "/failed_to_split_list.txt", "w") as text_file:
text_file.write('\n'.join(error_file_list)) text_file.write('\n'.join(error_file_list))
...@@ -108,13 +137,17 @@ print(f"skipped {len(error_file_list)} reports " ...@@ -108,13 +137,17 @@ print(f"skipped {len(error_file_list)} reports "
f"failed (see {args.target_folder_path + '/failed_to_split_list.txt'}).") f"failed (see {args.target_folder_path + '/failed_to_split_list.txt'}).")
processed_docs = len(lst_description) processed_docs = len(lst_description)
print(f"saved {processed_docs} description sections at {args.target_folder_path + '/description'}") #print(f"saved {processed_docs} description sections at {args.target_folder_path + '/description'}")
print(f"saved {processed_docs} diagnosis sections at {args.target_folder_path + '/diagnosis'}") #print(f"saved {processed_docs} diagnosis sections at {args.target_folder_path + '/diagnosis'}")
print(f"saved {processed_docs} end sections at {args.target_folder_path + '/end'}") #print(f"saved {processed_docs} end sections at {args.target_folder_path + '/end'}")
# create and save df_cases file, which is also used to save the labels (=cluster-indices) for each text. # create and save df_cases file, which is also used to save the labels (=cluster-indices) for each text.
df = pd.DataFrame(list(zip(lst_description, lst_diagnose, lst_end)), if args.label_in_filename_symbol == '':
df = pd.DataFrame(list(zip(lst_description, lst_diagnose, lst_end)),
columns=['description_text_files', 'diagnosis_text_files', 'end_text_files']) columns=['description_text_files', 'diagnosis_text_files', 'end_text_files'])
else:
df = pd.DataFrame(list(zip(lst_description, lst_diagnose, lst_end, lst_labels)),
columns=['description_text_files', 'diagnosis_text_files', 'end_text_files', 'label_files'])
df.to_pickle(args.df_cases_file) df.to_pickle(args.df_cases_file)
# search for authors in end-sections in order to add them as labels to the df_cases file: # search for authors in end-sections in order to add them as labels to the df_cases file:
...@@ -125,4 +158,6 @@ except: ...@@ -125,4 +158,6 @@ except:
df = pd.read_pickle(args.df_cases_file) df = pd.read_pickle(args.df_cases_file)
print(f"saved df_cases at {args.df_cases_file}\n") print(f"saved df_cases at {args.df_cases_file}\n")
print(f'it looks like this:')
print(df)
print() print()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment