from database_preparation.utils_stringpreparation import read_german_text, regexp
import re

# Placeholder codon used when none of the candidate headings occurs in a text.
# It is not expected to appear in any report, so searching for it finds nothing.
_MISSING_CODON = 'XENOTARSOSAURUS'


def _find_codon(text, word_list):
    """Return the last entry of ``word_list`` that occurs in ``text``.

    Later entries take precedence over earlier ones when several occur.
    Returns ``_MISSING_CODON`` if no entry occurs at all.
    """
    for word in reversed(word_list):
        if word in text:
            return word
    return _MISSING_CODON


def _get_codon_idx(text, start_codon, stop_codon_list):
    """Locate every section opened by ``start_codon`` and where each one ends.

    Like on DNA, the next start codon of any *other* section acts as the
    stop codon of the current one.

    Returns:
        ``(idx_start, idx_stop)``: the start offsets as delivered by
        ``regexp`` (project helper; its second return value is used as the
        sequence of match offsets -- TODO confirm) and, for each start, the
        offset of the nearest following stop codon. If no stop codon follows
        a start, the section runs to the end of ``text``.
    """
    _, idx_start = regexp(start_codon, text)
    other_codons = [codon for codon in stop_codon_list if codon != start_codon]
    idx_stop = []
    for start in idx_start:
        # str.find with a start offset returns absolute positions and avoids
        # copying the tail of the text for every stop codon.
        hits = [text.find(codon, start) for codon in other_codons]
        hits = [pos for pos in hits if pos >= 0]
        # A section that is last in the report has no following stop codon;
        # let it extend to the end of the text instead of failing on min([]).
        idx_stop.append(min(hits, default=len(text)))
    return idx_start, idx_stop


def _get_text_frames(text, start_codon, stop_codon_list):
    """Return the list of text slices for all sections opened by ``start_codon``."""
    idx_start, idx_stop = _get_codon_idx(text, start_codon, stop_codon_list)
    return [text[begin:end] for begin, end in zip(idx_start, idx_stop)]


def get3parts(t_file):
    """Split a German report text into description, conclusion and greetings.

    The text is read with ``read_german_text`` and cut at known section
    headings ("codons"). The headings suggest pathology reports; the code
    assumes that one author sticks to his/her wording within a report.

    Args:
        t_file: Passed unchanged to ``read_german_text``.

    Returns:
        A tuple ``(txt_micro, txt_conclusion, txt_greetings)``:

        * ``txt_micro``: the first description section, followed by the first
          "Nachbericht"/"Immunhistochemie" section if one exists.
        * ``txt_conclusion``: the last conclusion section.
        * ``txt_greetings``: everything from the first occurrence of the
          greetings marker to the end of the text, or ``None`` if the marker
          does not occur.

    Raises:
        IndexError: If no description section or no conclusion section is
            found in the text.
    """
    # load the text
    t_text = read_german_text(t_file)

    # get start codons for description and diagnosis
    start_codon_description = _find_codon(
        t_text,
        ['Lichtmikroskopie:', 'Mikroskopie:', "Histologie:",
         "Klinische Angaben:", "Wir erhielten ", "Eingesandt wurde:"],
    )
    start_codon_2nd = _find_codon(t_text, ["Nachbericht", "Immunhistochemie"])
    # Example wording seen in reports (German): a preliminary assessment based
    # on the frozen section ("Vorläufige Beurteilung gemäß der
    # Gefrierschnittführung: ...") followed by an assessment on the paraffin
    # material ("Beurteilung am Paraffinmaterial:").
    start_codon_conclusion = _find_codon(t_text, ["Beurteilung:", "Begutachtung:"])
    start_codon_comment = _find_codon(t_text, ["Kommentar"])
    if "Mit freundlichen" in t_text:
        start_codon_greetings = "Mit freundlichen"
    else:
        start_codon_greetings = "Prof."

    # every start codon doubles as a stop codon for the other sections
    stop_codon_list = [
        start_codon_conclusion,
        start_codon_description,
        start_codon_comment,
        start_codon_greetings,
        start_codon_2nd,
    ]

    # get the text parts
    txt_micro = _get_text_frames(t_text, start_codon_description, stop_codon_list)
    txt_2nd = _get_text_frames(t_text, start_codon_2nd, stop_codon_list)
    txt_conclusion = _get_text_frames(t_text, start_codon_conclusion, stop_codon_list)

    # finalise the text
    if not txt_conclusion:
        raise IndexError(f"no conclusion section found in {t_file!r}")
    txt_conclusion = txt_conclusion[-1]  # the last conclusion is the one kept

    if not txt_micro:
        raise IndexError(f"no description section found in {t_file!r}")
    if txt_2nd:
        txt_micro = str(txt_micro[0]) + str(txt_2nd[0])
    else:
        txt_micro = str(txt_micro[0])

    # get greetings-section:
    greetings_idx = t_text.find(start_codon_greetings)
    txt_greetings = t_text[greetings_idx:] if greetings_idx != -1 else None

    return txt_micro, txt_conclusion, txt_greetings