Commit 8ed471a1 authored by max's avatar max

extended search phrases for split_reports.py.

split_reports.py works now also with labeled datasets.
parent 94aab3d3
...@@ -19,13 +19,14 @@ def get3parts(t_file): ...@@ -19,13 +19,14 @@ def get3parts(t_file):
#%% get start codons for description and diagnosis #%% get start codons for description and diagnosis
# thinking that one pathologist sticks to his/her wording # thinking that one pathologist sticks to his/her wording
start_codon_description = find_codon(t_text, start_codon_description = find_codon(t_text,
['Lichtmikroskopie:', 'Mikroskopie:',"Histologie:"]) ['Lichtmikroskopie:', 'Mikroskopie:', "Histologie:", "Klinische Angaben:", "Wir erhielten"])
start_codon_2nd = find_codon(t_text, start_codon_2nd = find_codon(t_text,
["Nachbericht", "Immunhistochemie"]) ["Nachbericht", "Immunhistochemie"])
start_codon_conclusion = find_codon(t_text, start_codon_conclusion = find_codon(t_text,
["Beurteilung:", "Begutachtung:"]) ["Beurteilung:", "Begutachtung:"])
# Vorläufige Beurteilung gemäß der Gefrierschnittführung: .... Beurteilung am Paraffinmaterial:
start_codon_comment = find_codon(t_text, start_codon_comment = find_codon(t_text,
["Kommentar"]) ["Kommentar"])
...@@ -94,10 +95,8 @@ def get3parts(t_file): ...@@ -94,10 +95,8 @@ def get3parts(t_file):
txt_micro = str(txt_micro[0]) txt_micro = str(txt_micro[0])
# get greetings-section: # get greetings-section:
start_greedingsindex = t_text.find("Mit freundlichen") start_greedingsindex = t_text.find(start_codon_greetings)
txt_greetings = None txt_greetings = None
if start_greedingsindex == -1:
start_greedingsindex = t_text.find("Prof.")
if start_greedingsindex != -1: if start_greedingsindex != -1:
txt_greetings = t_text[start_greedingsindex:] txt_greetings = t_text[start_greedingsindex:]
......
...@@ -2,13 +2,17 @@ ...@@ -2,13 +2,17 @@
''' '''
this script splits the reports (.txt files) located in the this script splits the reports (.txt files) located in the
--path_to_reports folder into description-, diagnosis- and end-section --path_to_reports folder into description-, diagnosis- and end-section
and saves these parts in --target_folder_path (as -txt files). and saves these parts in --target_folder_path.
In addition, a pandas dataframe (--df_cases_file) is then generated in In addition, a pandas dataframe (--df_cases_file) is then generated in
which it is recorded which 3 report-sections belong together. The df_cases which it is recorded which 3 report-sections belong together. The df_cases
dataframe is also there to label the reports with different labelsets. dataframe is also there to label the reports with different labelsets.
Also pass --author_names (space separated names, cased) to label the reports by found Also pass --author_names (space separated names, cased) to label the reports by found
authors (stored in df_cases) authors (stored in df_cases)
The passed reports can also be labeled.
The label of each report is expected to be in the title/name of the .txt file,
located after the start symbol "--label_in_filename_symbol" until .txt
''' '''
import sys import sys
...@@ -35,20 +39,30 @@ parser.add_argument("--author_names", ...@@ -35,20 +39,30 @@ parser.add_argument("--author_names",
parser.add_argument("--text_encoding", parser.add_argument("--text_encoding",
default="latin-1") default="latin-1")
parser.add_argument("--use_newest_reports", action='store_true') parser.add_argument("--use_newest_reports", action='store_true')
parser.add_argument("--label_in_filename_symbol",
default='')
args = parser.parse_args() args = parser.parse_args()
# %% get all files # %% get all files
# get the primary reports, whose histonums are ending with .0: # get the primary reports, whose histonums are ending with .0:
report_file_list = glob.glob(args.path_to_reports + '/*0.txt') report_file_list = glob.glob(args.path_to_reports + '/*0.txt')
# get all last reports (newest ones) if len(report_file_list) <= 0:
if args.use_newest_reports: report_file_list = glob.glob(args.path_to_reports + '/*.txt')
print("using newest reports (if there is a H.123.0 and a H.123.2, take H.123.2)") if len(report_file_list) == 0:
for idx in tqdm(range(0, len(report_file_list))): print(f"ERROR: Cant find .txt files in {args.path_to_reports}.")
t_file_name = report_file_list[idx] exit(1)
report_file_list[idx] = glob.glob(t_file_name[0:-5] + "*.txt")[-1] print(f'Using all reports, located in {args.path_to_reports}')
else: else:
print("using oldes reports (if there is a H.123.0 and a H.123.2, take H.123.0)") # get all last reports (newest ones)
if args.use_newest_reports:
print("using newest reports (if there is a H.123.0 and a H.123.2, take H.123.2)")
for idx in tqdm(range(0, len(report_file_list))):
t_file_name = report_file_list[idx]
report_file_list[idx] = glob.glob(t_file_name[0:-5] + "*.txt")[-1]
else:
print("using oldes reports (if there is a H.123.0 and a H.123.2, take H.123.0)")
...@@ -60,14 +74,14 @@ def make_folder(dir): ...@@ -60,14 +74,14 @@ def make_folder(dir):
make_folder(args.target_folder_path + "/description") make_folder(args.target_folder_path + "/description")
make_folder(args.target_folder_path + "/diagnosis") make_folder(args.target_folder_path + "/diagnosis")
make_folder(args.target_folder_path + "/end") make_folder(args.target_folder_path + "/end")
make_folder(args.target_folder_path + "/short_diagnosis") make_folder(args.target_folder_path + "/label")
print(f"Splitting reports of corpus {args.path_to_reports} into description-, diagnosis- and end-sections...") print(f"Splitting reports of corpus {args.path_to_reports} into description-, diagnosis- and end-sections...")
# %% iterate over all files # %% iterate over all files
error_file_list = [] error_file_list = []
no_error_file_list = [] no_error_file_list = []
lst_description, lst_diagnose, lst_end = [], [], [] lst_description, lst_diagnose, lst_end, lst_labels = [], [], [], []
for idx, t_file in tqdm(enumerate(report_file_list)): for idx, t_file in tqdm(enumerate(report_file_list)):
try: try:
...@@ -93,10 +107,25 @@ for idx, t_file in tqdm(enumerate(report_file_list)): ...@@ -93,10 +107,25 @@ for idx, t_file in tqdm(enumerate(report_file_list)):
end = "None" end = "None"
text_file.write(end) text_file.write(end)
if args.label_in_filename_symbol != '':
try:
a = t_file.index('#') + 1
b = t_file.index('.txt')
label = t_file[a:b]
except:
label = 'None'
with open(args.target_folder_path + "/label/label#" + str(idx) + ".txt", "w",
encoding=args.text_encoding) as text_file:
text_file.write(label)
lst_labels.append('label#' + str(idx) + ".txt")
lst_description.append('description#' + str(idx) + ".txt") lst_description.append('description#' + str(idx) + ".txt")
lst_diagnose.append('diagnosis#' + str(idx) + ".txt") lst_diagnose.append('diagnosis#' + str(idx) + ".txt")
lst_end.append('end#' + str(idx) + ".txt") lst_end.append('end#' + str(idx) + ".txt")
#save skipped reports: #save skipped reports:
with open(args.target_folder_path + "/failed_to_split_list.txt", "w") as text_file: with open(args.target_folder_path + "/failed_to_split_list.txt", "w") as text_file:
text_file.write('\n'.join(error_file_list)) text_file.write('\n'.join(error_file_list))
...@@ -108,13 +137,17 @@ print(f"skipped {len(error_file_list)} reports " ...@@ -108,13 +137,17 @@ print(f"skipped {len(error_file_list)} reports "
f"failed (see {args.target_folder_path + '/failed_to_split_list.txt'}).") f"failed (see {args.target_folder_path + '/failed_to_split_list.txt'}).")
processed_docs = len(lst_description) processed_docs = len(lst_description)
print(f"saved {processed_docs} description sections at {args.target_folder_path + '/description'}") #print(f"saved {processed_docs} description sections at {args.target_folder_path + '/description'}")
print(f"saved {processed_docs} diagnosis sections at {args.target_folder_path + '/diagnosis'}") #print(f"saved {processed_docs} diagnosis sections at {args.target_folder_path + '/diagnosis'}")
print(f"saved {processed_docs} end sections at {args.target_folder_path + '/end'}") #print(f"saved {processed_docs} end sections at {args.target_folder_path + '/end'}")
# create and save df_cases file, which is also used to save the labels (=cluster-indices) for each text. # create and save df_cases file, which is also used to save the labels (=cluster-indices) for each text.
df = pd.DataFrame(list(zip(lst_description, lst_diagnose, lst_end)), if args.label_in_filename_symbol == '':
df = pd.DataFrame(list(zip(lst_description, lst_diagnose, lst_end)),
columns=['description_text_files', 'diagnosis_text_files', 'end_text_files']) columns=['description_text_files', 'diagnosis_text_files', 'end_text_files'])
else:
df = pd.DataFrame(list(zip(lst_description, lst_diagnose, lst_end, lst_labels)),
columns=['description_text_files', 'diagnosis_text_files', 'end_text_files', 'label_files'])
df.to_pickle(args.df_cases_file) df.to_pickle(args.df_cases_file)
# search for authors in end-sections in order to add them as labels to the df_cases file: # search for authors in end-sections in order to add them as labels to the df_cases file:
...@@ -125,4 +158,6 @@ except: ...@@ -125,4 +158,6 @@ except:
df = pd.read_pickle(args.df_cases_file) df = pd.read_pickle(args.df_cases_file)
print(f"saved df_cases at {args.df_cases_file}\n") print(f"saved df_cases at {args.df_cases_file}\n")
print(f'it looks like this:')
print(df)
print() print()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment