Commit c7bab612 authored by max

Now split_reports.py and cluster plots should work better with golden labels

parent 8ed471a1
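The "golden labels" referred to here are the ground-truth class labels that split_reports.py now stores in the df_cases pickle as a label_golden column, so the clustering plots can treat them like any other label set. A minimal sketch (not part of the commit) of how the resulting pickle can be inspected, assuming the default path database/df_cases.pkl that appears in the plotting script below:

import pandas as pd

df = pd.read_pickle("database/df_cases.pkl")                 # path taken from the plotting script below
label_columns = [c for c in df.columns if c.startswith("label_")]
print(label_columns)               # e.g. ['label_author', 'label_golden', 'label_KMeans', ...]
print("label_golden" in df)        # the updated scripts guard on membership checks like this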
@@ -14,10 +14,9 @@ plot_author_histos = False
 cluster = 2
 clustersets = ["HDBSCAN", "KMeans", "LDA", "GSDPMM",
-               "top2vec", "Patho_BERT", "German_BERT"]
+               "top2vec", "Patho_BERT", "German_BERT", "golden"]
 df = pd.read_pickle(args.df_cases_file)
-authors_labels = df["label_author"]
 # plot histograms: how much docs do have the same label=cluster-index?
 for i,label_set in enumerate(clustersets):
@@ -27,19 +26,24 @@ for i,label_set in enumerate(clustersets):
     except:
         print(f"skipping {label_set}. it is not in the df_cases_file.")
         continue
     if plot_author_histos:
-        authors_of_cluster = [authors_labels[i] for i, label in enumerate(cluster_labels) if
-                              label == cluster]
-        authors = np.asarray(authors_of_cluster)
-        x = [-1,0,1,2,3]
-        h = []
-        for l in x:
-            h.append(sum([1 for a in authors if a == l]))
-        plt.bar(x, height=h)
-        plt.title(label_set + " authors in cluster " + str(cluster))
-        file_path = 'TextClustering/plots/histograms/histogram_' + label_set + "_cluster" + str(cluster) + "_authors.png"
+        if 'label_author' in df:
+            authors_labels = df["label_author"]
+            authors_of_cluster = [authors_labels[i] for i, label in enumerate(cluster_labels) if
+                                  label == cluster]
+            authors = np.asarray(authors_of_cluster)
+            x = [-1,0,1,2,3]
+            h = []
+            for l in x:
+                h.append(sum([1 for a in authors if a == l]))
+            plt.bar(x, height=h)
+            plt.title(label_set + " authors in cluster " + str(cluster))
+            file_path = 'TextClustering/plots/histograms/histogram_' + label_set + "_cluster" + str(cluster) + "_authors.png"
+        else:
+            print(f'Cant plot author histos, there is not "label_author" in df_cases.')
     else:
         labels = np.asarray([l for l in cluster_labels if l != -1])
......
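For reference, the counting loop guarded above builds a bar chart of how often each author id (apparently -1 plus 0 to 3, judging from the x values in the diff) occurs in the selected cluster. A standalone toy version of the same counting, with made-up author ids rather than data from the repository:

import numpy as np
import matplotlib.pyplot as plt

authors = np.asarray([-1, 0, 0, 2, 3, 0, -1])     # stand-in for df["label_author"] within one cluster
x = [-1, 0, 1, 2, 3]
h = [int(np.sum(authors == a)) for a in x]        # same counts as the explicit loop in the diff
plt.bar(x, height=h)
plt.title("toy author histogram")
plt.savefig("toy_author_histogram.png")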
@@ -13,8 +13,7 @@ df_cases_file = "database/df_cases.pkl"
 def save_umap_plot(clustersetname, df, title=None):
     if not 'label_' + clustersetname in df:
-        print("skipping " + clustersetname + ", it is not in df_cases_file:")
-        print(df)
+        print("skipping " + clustersetname + ", it is not in df_cases_file.")
         return
     predictedCluster_text_features = label_list_as_int_list(df['label_' + clustersetname])
@@ -39,7 +38,7 @@ def save_umap_plot(clustersetname, df, title=None):
     golden_labels = df["label_golden"]
     cluster_scatter_plot(umap_text_features2D, golden_labels,
                          "TextClustering/plots/UMAP/" + clustersetname + "_UMAP_goldenlabel.png",
-                         show_plot=False, colorblindfriendly=True
+                         show_plot=False, colorblindfriendly=False
                          , fig_title=title + " colored with golden labels")
 def main():
@@ -63,10 +62,11 @@ def main():
                          "TextClustering/plots/PCA/LDA_PCA.png",
                          show_plot=False, colorblindfriendly=False,
                          fig_title="LDA (PCA representation)")
-    cluster_scatter_plot(features2D, df["label_author"],
-                         "TextClustering/plots/PCA/LDA_PCA_authors.png",
-                         show_plot=False, colorblindfriendly=True,
-                         number_data_points=False, fig_title='LDA (PCA representation), colored by authors')
+    if 'label_author' in df:
+        cluster_scatter_plot(features2D, df["label_author"],
+                             "TextClustering/plots/PCA/LDA_PCA_authors.png",
+                             show_plot=False, colorblindfriendly=True,
+                             number_data_points=False, fig_title='LDA (PCA representation), colored by authors')
 if __name__ == '__main__':
     main()
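The hunk above only skips the author-colored PCA plot when the DataFrame has no label_author column. A toy check of that guard, not code from the repository:

import pandas as pd

df = pd.DataFrame({"label_LDA": [0, 1, 1]})           # toy frame without an author column
if 'label_author' in df:
    print("would plot the author-colored PCA")
else:
    print("skipped: no label_author column")          # the new behaviour instead of a KeyError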
@@ -169,7 +169,12 @@ def cluster_scatter_plot(umap_text_features2D, labels, file_path = [],
         nummerate_clusters_in_plot(x,y,c)
     '''plt.legend(handles=scatter.legend_elements()[0],
                   labels=[str(l) for l in c], loc="best")'''
-    plt.colorbar(values=[int(e) for e in np.unique(np.asarray(c))])
+    v = [int(e) for e in np.unique(np.asarray(c))]
+    try:
+        plt.colorbar(values=v)
+    except Exception as message:
+        print(f'failed to generate colorbar for {file_path}')
+        print(f'{message}')
     c = [e for e in valid_labels if e > 19]
     if len(c)>0:
......
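The change above wraps the existing plt.colorbar call in a try/except so a failing colorbar (for example, matplotlib raising because no mappable is available) no longer aborts the whole plot. A self-contained illustration of that guard pattern, not code from the repository:

import numpy as np
import matplotlib.pyplot as plt

labels = np.array([0, 1, 1, 2, 2, 2])
points = np.random.rand(len(labels), 2)
plt.scatter(points[:, 0], points[:, 1], c=labels)
try:
    plt.colorbar(values=[int(e) for e in np.unique(labels)])   # may raise, e.g. RuntimeError
except Exception as message:
    print(f"failed to generate colorbar: {message}")
plt.savefig("toy_scatter.png")                                  # the figure is saved either way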
-{"source_data": "../DataNephroTexts/description", "tokenized": true, "cased": false, "stopword_filtered": true, "use_combiner": true, "use_replacer": true, "lemma_mode": 3, "punct_mode": 2, "number_mode": 3}
\ No newline at end of file
+{"source_data": "./database/description", "tokenized": true, "cased": false, "stopword_filtered": true, "use_combiner": true, "use_replacer": true, "lemma_mode": 3, "punct_mode": 2, "number_mode": 3}
\ No newline at end of file
-{"source_data": "../DataNephroTexts/diagnosis", "tokenized": true, "cased": false, "stopword_filtered": true, "use_combiner": true, "use_replacer": true, "lemma_mode": 3, "punct_mode": 2, "number_mode": 3}
\ No newline at end of file
+{"source_data": "./database/diagnosis", "tokenized": true, "cased": false, "stopword_filtered": true, "use_combiner": true, "use_replacer": true, "lemma_mode": 3, "punct_mode": 2, "number_mode": 3}
\ No newline at end of file
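Both files above are one-line JSON preprocessing configs whose source_data path moves from ../DataNephroTexts to ./database. A hedged sketch of how such a config can be read back; the file name preprocessing_config.json is only a placeholder for illustration:

import json

with open("preprocessing_config.json", "r") as f:      # placeholder name, not the real file name
    cfg = json.load(f)
print(cfg["source_data"])                               # now "./database/description" or "./database/diagnosis"
print(cfg["lemma_mode"], cfg["punct_mode"], cfg["number_mode"])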
@@ -19,7 +19,7 @@ def get3parts(t_file):
     #%% get start codons for description and diagnosis
     # thinking that one pathologist sticks to his/her wording
     start_codon_description = find_codon(t_text,
-                                         ['Lichtmikroskopie:', 'Mikroskopie:', "Histologie:", "Klinische Angaben:", "Wir erhielten"])
+                                         ['Lichtmikroskopie:', 'Mikroskopie:', "Histologie:", "Klinische Angaben:", "Wir erhielten ", "Eingesandt wurde:"])
     start_codon_2nd = find_codon(t_text,
                                  ["Nachbericht", "Immunhistochemie"])
......
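find_codon itself is not shown in this diff; judging from the call sites it scans the report text for the first of several section-start markers ("start codons"), and the commit simply adds "Wir erhielten " and "Eingesandt wurde:" as further candidates. A hypothetical stand-in with the same call shape, purely for illustration:

def find_codon_stub(text, codons):
    """Return the position of the earliest matching marker, or -1 if none is found."""
    hits = [text.find(c) for c in codons]
    hits = [h for h in hits if h != -1]
    return min(hits) if hits else -1

t_text = "Klinische Angaben: ... Lichtmikroskopie: ... Nachbericht ..."
print(find_codon_stub(t_text, ['Lichtmikroskopie:', 'Mikroskopie:', "Histologie:",
                               "Klinische Angaben:", "Wir erhielten ", "Eingesandt wurde:"]))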
@@ -41,6 +41,7 @@ parser.add_argument("--text_encoding",
 parser.add_argument("--use_newest_reports", action='store_true')
 parser.add_argument("--label_in_filename_symbol",
                     default='')
+parser.add_argument("--save_labels_as_ints", action='store_true')
 args = parser.parse_args()
 # %% get all files
@@ -119,7 +120,8 @@ for idx, t_file in tqdm(enumerate(report_file_list)):
                   encoding=args.text_encoding) as text_file:
             text_file.write(label)
-        lst_labels.append('label#' + str(idx) + ".txt")
+        # lst_labels.append('label#' + str(idx) + ".txt")
+        lst_labels.append(label)
     lst_description.append('description#' + str(idx) + ".txt")
     lst_diagnose.append('diagnosis#' + str(idx) + ".txt")
@@ -146,18 +148,29 @@ if args.label_in_filename_symbol == '':
     df = pd.DataFrame(list(zip(lst_description, lst_diagnose, lst_end)),
                       columns=['description_text_files', 'diagnosis_text_files', 'end_text_files'])
 else:
-    df = pd.DataFrame(list(zip(lst_description, lst_diagnose, lst_end, lst_labels)),
-                      columns=['description_text_files', 'diagnosis_text_files', 'end_text_files', 'label_files'])
+    if args.save_labels_as_ints:
+        class2int = dict()
+        for i, c in enumerate(list(set(lst_labels))):
+            class2int[c] = i
+        df = pd.DataFrame(list(zip(lst_description, lst_diagnose, lst_end, [class2int[l] for l in lst_labels])),
+                          columns=['description_text_files', 'diagnosis_text_files', 'end_text_files', 'label_golden'])
+        with open(args.target_folder_path + "/classname2integer.txt", "w") as text_file:
+            text_file.write('\n'.join([f'{class2int[name]}\t\t{name}' for name in list(set(lst_labels))]))
+    else:
+        df = pd.DataFrame(list(zip(lst_description, lst_diagnose, lst_end, lst_labels)),
+                          columns=['description_text_files', 'diagnosis_text_files', 'end_text_files', 'label_golden'])
 df.to_pickle(args.df_cases_file)
 # search for authors in end-sections in order to add them as labels to the df_cases file:
-try:
-    add_author_labels_to_df_cases(args.target_folder_path + '/end', args.author_names.split(' '), args.df_cases_file)
-except:
-    print("label the reports with authors failed.")
+if args.label_in_filename_symbol == '':
+    try:
+        add_author_labels_to_df_cases(args.target_folder_path + '/end', args.author_names.split(' '), args.df_cases_file)
+    except:
+        print("label the reports with authors failed.")
+    df = pd.read_pickle(args.df_cases_file)
 print(f"saved df_cases at {args.df_cases_file}\n")
-print(f'it looks like this:')
-print(df)
+print(f"first element of {args.df_cases_file}: \n{df.iloc[0]}")
 print()
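The new --save_labels_as_ints branch maps each distinct report label to an integer, stores the integers in the label_golden column and writes the name-to-integer table to classname2integer.txt. A toy sketch of the same mapping; the label names are invented and, as in the committed code, the numbering depends on the iteration order of set():

lst_labels = ["class_a", "class_b", "class_a", "class_c"]           # invented example labels
class2int = {c: i for i, c in enumerate(set(lst_labels))}
int_labels = [class2int[l] for l in lst_labels]                     # what ends up in "label_golden"
mapping_txt = "\n".join(f"{class2int[name]}\t\t{name}" for name in set(lst_labels))
print(int_labels)
print(mapping_txt)                                                  # content of classname2integer.txt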