Commit d5224af3 authored by Maximilian Legnar's avatar Maximilian Legnar

added first version

parent a6bfb3de
def cluster_entropy(df):
#%% import
from CorpusHomogeneity.text_entropy import corpus_entropy
import numpy as np
#%%
cluster_ids = np.unique(df.cluster)
ent_mean, ent_std = [None] * len(cluster_ids), [None] * len(cluster_ids)
for idx, i_cluster in enumerate(cluster_ids):
        if i_cluster == -1:  # important: skip texts that were not assigned to any cluster (noise id -1)
ent_mean[idx] = np.nan
ent_std[idx] = np.nan
else:
t_corpus = df[df['cluster'] == i_cluster]
t_corpus = t_corpus.text.tolist()
ent_mean[idx], ent_std[idx] = corpus_entropy(t_corpus)
#%% output
    # aggregate over clusters (compute the std before overwriting the list of per-cluster means)
    ent_std = float(np.nanstd(ent_mean))
    ent_mean = float(np.nanmean(ent_mean))
return ent_mean, ent_std
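# Minimal usage sketch (illustrative only): cluster_entropy expects a DataFrame with a
# 'cluster' column (integer cluster ids, -1 = noise) and a 'text' column, e.g.:
#   import pandas as pd
#   df = pd.DataFrame({'cluster': [0, 0, 1, -1],
#                      'text': ['Befund A', 'Befund B', 'Befund C', 'Rauschen']})
#   mean_entropy, std_entropy = cluster_entropy(df)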
#%% tokenize text
def tokenize_corpus(corpus):
#%% imports
import nltk
import string
from nltk.tokenize import word_tokenize
from HanTa import HanoverTagger as ht
stop_words = nltk.corpus.stopwords.words('german')
tagger = ht.HanoverTagger('morphmodel_ger.pgz')
from tqdm import tqdm
# %% read the files to a list
corpus_tokenized = corpus
for idx, t_text in tqdm(enumerate(corpus_tokenized)):
#%% get the words from the text
t_text = str(t_text)
tokens = word_tokenize(t_text, language='german')
tokens = list(filter(lambda token: token not in string.punctuation, tokens))
#%% get only the nouns
nouns = tagger.tag_sent(tokens)
nouns = [lemma for (word, lemma, pos) in nouns if pos == "NN" or pos == "NE"]
#%% mount it back
corpus_tokenized[idx] = nouns
#%% output-layer
return corpus_tokenized
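# Note: HanoverTagger.tag_sent returns (word, lemma, POS) triples (STTS tagset), e.g. roughly
#   tagger.tag_sent(['Die', 'Niere', 'zeigt', 'Veraenderungen'])
#   -> [('Die', 'die', 'ART'), ('Niere', 'Niere', 'NN'), ...]
# so the comprehension above keeps only the lemmata of nouns (NN) and named entities (NE).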
#%% calculate the entropy
def corpus_entropy(corpus):
    '''
    How much do the individual documents differ from the whole corpus?
    Returns mean and std of the per-document divergence between each document's
    term frequencies and the corpus-wide term frequencies.
    '''
#%% input layer
#corpus_tokenized = tokenize_corpus(corpus)
import numpy as np
text1 = np.asarray(corpus[0])
    is_tokenized = bool(text1.ndim)  # a raw string yields a 0-d array, a token list a 1-d array
if is_tokenized:
corpus_not_tokenized = [" ".join(i_text) for i_text in corpus]
else:
corpus_not_tokenized = corpus
# corpus_not_tokenized = [nltk.tokenize.word_tokenize(i_text, language='german') for i_text in corpus]
#%% count the word-occurences
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
vectorizer = CountVectorizer()
    try:
        X = vectorizer.fit_transform(corpus_not_tokenized)
    except Exception:
        # e.g. empty vocabulary (corpus contains only empty documents)
        return np.nan, np.nan
df = X.toarray()
#%% calculate the entropy
from scipy.stats import entropy
    corpus_tf = df.sum(axis=0)  # corpus-wide term frequencies
    ent_values = []
    for i in range(df.shape[0]):
        document_tf = df[i, :]
        # divergence of the document's term distribution vs. the corpus-wide distribution
        ent_values.append(entropy(document_tf, qk=corpus_tf))
#%% output-layer
entropy_mean = np.nanmean(ent_values)
entropy_std = np.nanstd(ent_values)
#%%
return entropy_mean, entropy_std
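# Note: scipy.stats.entropy(pk, qk) normalizes both count vectors to probability
# distributions and returns the Kullback-Leibler divergence
#   D_KL(p || q) = sum_i p_i * log(p_i / q_i),
# i.e. how strongly a document's term distribution diverges from the corpus-wide one.
# A lower mean value therefore indicates a more homogeneous corpus.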
if __name__ == '__main__':
    # corpus_entropy expects a list of (raw or tokenized) texts; tiny smoke test:
    print(corpus_entropy(["Beispieltext eins", "Beispieltext zwei"]))
# -*- coding: iso-8859-1 -*-
import random
import os, sys
from os import listdir
from os.path import isfile, join
import pyarrow as pa
import pandas as pd
import datasets
from database_preparation.utils_stringpreparation import read_german_text
import argparse
# parse arguments:
sys.path.append(os.getcwd())
parser = argparse.ArgumentParser()
parser.add_argument("--path_to_reports",
default='../DataNephroTexts/reports')
parser.add_argument("--output_path",
default='./LanguageModelling/hf_nephro_set_1')
parser.add_argument("--percentage_train_amount", type=float, default=0.9)
args = parser.parse_args()
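# Example invocation (values shown are the argparse defaults above; adjust the paths to your setup):
#   python generate_dataset_for_mlm.py \
#       --path_to_reports ../DataNephroTexts/reports \
#       --output_path ./LanguageModelling/hf_nephro_set_1 \
#       --percentage_train_amount 0.9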
def save_as_hf_dataset(datapath, file_id_list, output_path):
report_texts = []
    for file_id in file_id_list:  # avoid shadowing the builtin id()
        text = read_german_text(join(datapath, str(file_id)))
report_texts.append(text)
df = pd.DataFrame({
'text': report_texts
})
mytable = pa.Table.from_pandas(df)
my_dataset = datasets.Dataset(mytable)
#my_dataset.save_to_disk(output_path)
my_dataset.to_json(output_path + ".json")
print(f"Generated {output_path}")
def main():
print("processing " + args.path_to_reports)
reports = [f for f in listdir(args.path_to_reports) if isfile(join(args.path_to_reports, f))]
    reps0 = [r for r in reports if r[-5] == '0']  # keep only filenames with '0' at position -5 (e.g. '*0.txt')
random.shuffle(reps0)
last_index = len(reps0) - 1
until = int(args.percentage_train_amount * last_index)
if args.percentage_train_amount < 1:
train = [e for i, e in enumerate(reps0) if i <= until]
val = [e for i, e in enumerate(reps0) if i > until]
save_as_hf_dataset(args.path_to_reports, train, args.output_path + "_train")
save_as_hf_dataset(args.path_to_reports, val, args.output_path + "_validation")
else:
save_as_hf_dataset(args.path_to_reports, reps0, args.output_path)
    # how to load the generated json dataset:
    '''ds = datasets.load_dataset("json", data_files="./LanguageModelling/path2set.json")
    print(ds)'''
    # (datasets.load_from_disk("./LanguageModelling/path2set") applies if save_to_disk is used instead of to_json)
return 0
if __name__ == "__main__":
main()
# -*- coding: iso-8859-1 -*-
'''
This script is based on:
https://github.com/huggingface/transformers/tree/master/examples/pytorch/language-modeling
The passed training text must be of type "datasets.Dataset"
You can use generate_dataset_for_mlm.py to convert a folder with .txt files into a huggingface Dataset.
Use the argument --no_cuda if it is not working with CUDA.
If the GPU memory is too small, reduce --per_device_train_batch_size.
The program arguments are largely the same as those of
transformers.TrainingArguments:
https://huggingface.co/docs/transformers/v4.17.0/en/main_classes/trainer#transformers.TrainingArguments
Good explanations and tips:
https://colab.research.google.com/github/gmihaila/ml_things/blob/master/notebooks/pytorch/pretrain_transformers_pytorch.ipynb#scrollTo=E1F-XIQCdOgj
'''
# example arguments for loss curve analysis:
'''
--model_name_or_path bert-base-german-cased
--train_file ./LanguageModelling/hf_nephro_set_3_train.json
--validation_file ./LanguageModelling/hf_nephro_set_3_validation.json
--output_dir ./LanguageModelling/ger-patho-bert-v3
--do_train
--do_eval
--overwrite_output_dir
--num_train_epochs 10
--evaluation_strategy steps
--logging_steps 30
--whole_word_mask True
--per_device_train_batch_size=8
'''
# example arguments for final training:
'''
--model_name_or_path bert-base-german-cased
--train_file ./LanguageModelling/hf_nephro_set_3.json
--output_dir ./LanguageModelling/ger-patho-bert-v3
--do_train
--overwrite_output_dir
--num_train_epochs 1
--whole_word_mask True
--per_device_train_batch_size=8
'''
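# A training run can then be launched with these arguments, e.g. (assuming this adapted
# run_mlm script is saved as ./LanguageModelling/run_mlm.py):
#   python ./LanguageModelling/run_mlm.py --model_name_or_path bert-base-german-cased \
#       --train_file ./LanguageModelling/hf_nephro_set_3.json --do_train --overwrite_output_dir ...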
# Copyright 2020 The HuggingFace Team All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...) on a text file or a dataset.
Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
https://huggingface.co/models?filter=masked-lm
"""
# You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments.
import logging
import math
import os
import sys
from dataclasses import dataclass, field
from typing import Optional
import datasets
from datasets import load_dataset
import transformers
from transformers import (
CONFIG_MAPPING,
MODEL_FOR_MASKED_LM_MAPPING,
AutoConfig,
AutoModelForMaskedLM,
AutoTokenizer,
DataCollatorForLanguageModeling,
DataCollatorForWholeWordMask,
HfArgumentParser,
Trainer,
TrainingArguments,
set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.13.0.dev0")
# use pip install git+https://github.com/huggingface/transformers to install 4.13 from source!
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
logger = logging.getLogger(__name__)
MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
@dataclass
class ModelArguments:
"""
Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
"""
model_name_or_path: Optional[str] = field(
default=None,
metadata={
"help": "The model checkpoint for weights initialization."
"Don't set if you want to train a model from scratch."
},
)
model_type: Optional[str] = field(
default=None,
metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
)
config_overrides: Optional[str] = field(
default=None,
metadata={
"help": "Override some existing default config settings when a model is trained from scratch. Example: "
"n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index"
},
)
config_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
)
tokenizer_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
)
cache_dir: Optional[str] = field(
default=None,
metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
)
use_fast_tokenizer: bool = field(
default=True,
metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
)
model_revision: str = field(
default="main",
metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
)
use_auth_token: bool = field(
default=False,
metadata={
"help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
"with private models)."
},
)
def __post_init__(self):
if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
raise ValueError(
"--config_overrides can't be used in combination with --config_name or --model_name_or_path"
)
@dataclass
class DataTrainingArguments:
"""
    Arguments pertaining to what data we are going to input our model for training and eval.
"""
dataset_name: Optional[str] = field(
default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
)
dataset_config_name: Optional[str] = field(
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
)
    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
validation_file: Optional[str] = field(
default=None,
metadata={"help": "An optional input evaluation database_preparation file to evaluate the perplexity on (a text file)."},
)
overwrite_cache: bool = field(
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
)
validation_split_percentage: Optional[int] = field(
default=5,
metadata={
"help": "The percentage of the train set used as validation set in case there's no validation split"
},
)
max_seq_length: Optional[int] = field(
default=None,
metadata={
"help": "The maximum total input sequence length after tokenization. Sequences longer "
"than this will be truncated."
},
)
preprocessing_num_workers: Optional[int] = field(
default=None,
metadata={"help": "The number of processes to use for the preprocessing."},
)
mlm_probability: float = field(
default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}
)
line_by_line: bool = field(
default=False,
metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
)
pad_to_max_length: bool = field(
default=False,
metadata={
"help": "Whether to pad all samples to `max_seq_length`. "
"If False, will pad the samples dynamically when batching to the maximum length in the batch."
},
)
max_train_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of training examples to this "
"value if set."
},
)
max_eval_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
"value if set."
},
)
whole_word_mask: bool = field(
default=False,
metadata={
"help": "Wether to use whole-word-masking, defaults False. if False: Use subword-masking"
},
)
def __post_init__(self):
if self.dataset_name is None and self.train_file is None and self.validation_file is None:
raise ValueError("Need either a dataset name or a training/validation file.")
else:
if self.train_file is not None:
extension = self.train_file.split(".")[-1]
if extension not in ["csv", "json", "txt"]:
raise ValueError("`train_file` should be a csv, a json or a txt file.")
if self.validation_file is not None:
extension = self.validation_file.split(".")[-1]
if extension not in ["csv", "json", "txt"]:
raise ValueError("`validation_file` should be a csv, a json or a txt file.")
def main():
# See all possible arguments in src/transformers/training_args.py
# or by passing the --help flag to this script.
# We now keep distinct sets of args, for a cleaner separation of concerns.
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
# If we pass only one argument to the script and it's the path to a json file,
# let's parse it to get our arguments.
model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
else:
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
# Setup logging
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)],
)
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
# Log on each process the small summary:
logger.warning(
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
)
# Set the verbosity to info of the Transformers logger (on main process only):
logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# Set seed before initializing model.
set_seed(training_args.seed)
# Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
# or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
# (the dataset will be downloaded automatically from the datasets Hub
#
# For CSV/JSON files, this script will use the column called 'text' or the first column. You can easily tweak this
# behavior (see below)
#
# In distributed training, the load_dataset function guarantee that only one local process can concurrently
# download the dataset.
if data_args.dataset_name is not None:
# Downloading and loading a dataset from the hub.
raw_datasets = load_dataset(
data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
)
if "validation" not in raw_datasets.keys():
raw_datasets["validation"] = load_dataset(
data_args.dataset_name,
data_args.dataset_config_name,
split=f"train[:{data_args.validation_split_percentage}%]",
cache_dir=model_args.cache_dir,
)
raw_datasets["train"] = load_dataset(
data_args.dataset_name,
data_args.dataset_config_name,
split=f"train[{data_args.validation_split_percentage}%:]",
cache_dir=model_args.cache_dir,
)
else:
data_files = {}
if data_args.train_file is not None:
data_files["train"] = data_args.train_file
extension = data_args.train_file.split(".")[-1]
if data_args.validation_file is not None:
data_files["validation"] = data_args.validation_file
extension = data_args.validation_file.split(".")[-1]
if extension == "txt":
extension = "text"
raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
        # If no validation data is there, validation_split_percentage will be used to divide the dataset.
if "validation" not in raw_datasets.keys():
raw_datasets["validation"] = load_dataset(
extension,
data_files=data_files,
split=f"train[:{data_args.validation_split_percentage}%]",
cache_dir=model_args.cache_dir,
)
raw_datasets["train"] = load_dataset(
extension,
data_files=data_files,
split=f"train[{data_args.validation_split_percentage}%:]",
cache_dir=model_args.cache_dir,
)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html.
# Load pretrained model and tokenizer
#
# Distributed training:
# The .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.
config_kwargs = {
"cache_dir": model_args.cache_dir,
"revision": model_args.model_revision,
"use_auth_token": True if model_args.use_auth_token else None,
}
if model_args.config_name:
config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
elif model_args.model_name_or_path:
config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
else:
config = CONFIG_MAPPING[model_args.model_type]()
logger.warning("You are instantiating a new config instance from scratch.")
if model_args.config_overrides is not None:
logger.info(f"Overriding config: {model_args.config_overrides}")
config.update_from_string(model_args.config_overrides)
tokenizer_kwargs = {
"cache_dir": model_args.cache_dir,
"use_fast": model_args.use_fast_tokenizer,
"revision": model_args.model_revision,
"use_auth_token": True if model_args.use_auth_token else None,
}
if model_args.tokenizer_name:
tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
elif model_args.model_name_or_path:
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)
else:
raise ValueError(
"You are instantiating a new tokenizer from scratch. This is not supported by this script."
"You can do it from another script, save it, and load it from here, using --tokenizer_name."
)
if model_args.model_name_or_path:
model = AutoModelForMaskedLM.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
else:
logger.info("Training new model from scratch")
model = AutoModelForMaskedLM.from_config(config)
model.resize_token_embeddings(len(tokenizer))
# Preprocessing the datasets.
# First we tokenize all the texts.
if training_args.do_train:
column_names = raw_datasets["train"].column_names
else:
column_names = raw_datasets["validation"].column_names
text_column_name = "text" if "text" in column_names else column_names[0]
if data_args.max_seq_length is None:
max_seq_length = tokenizer.model_max_length
if max_seq_length > 1024:
logger.warning(
f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
"Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
)
max_seq_length = 1024
else:
if data_args.max_seq_length > tokenizer.model_max_length:
logger.warning(
f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
)
max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
if data_args.line_by_line:
# When using line_by_line, we just tokenize each nonempty line.
padding = "max_length" if data_args.pad_to_max_length else False
def tokenize_function(examples):
# Remove empty lines
examples[text_column_name] = [
line for line in examples[text_column_name] if len(line) > 0 and not line.isspace()
]
return tokenizer(
examples[text_column_name],
padding=padding,
truncation=True,
max_length=max_seq_length,
# We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
# receives the `special_tokens_mask`.
return_special_tokens_mask=True,
)
with training_args.main_process_first(desc="dataset map tokenization"):
tokenized_datasets = raw_datasets.map(
tokenize_function,
batched=True,
num_proc=data_args.preprocessing_num_workers,
remove_columns=[text_column_name],
load_from_cache_file=not data_args.overwrite_cache,
desc="Running tokenizer on dataset line_by_line",
)
else:
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
# We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
# efficient when it receives the `special_tokens_mask`.
def tokenize_function(examples):
return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
with training_args.main_process_first(desc="dataset map tokenization"):
tokenized_datasets = raw_datasets.map(
tokenize_function,
batched=True,
num_proc=data_args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not data_args.overwrite_cache,
desc="Running tokenizer on every text in dataset",
)
    # Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.
if total_length >= max_seq_length:
total_length = (total_length // max_seq_length) * max_seq_length
# Split by chunks of max_len.
result = {
k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
for k, t in concatenated_examples.items()
}
return result
# Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
# remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
# might be slower to preprocess.
#
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
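    # Worked example: with max_seq_length = 512 and a batch of 1,000 tokenized texts whose
    # concatenated length is 10,300 tokens, total_length is truncated to
    # (10300 // 512) * 512 = 10240 and re-split into 20 chunks of 512 tokens;
    # the trailing 60 tokens are dropped.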
with training_args.main_process_first(desc="grouping texts together"):
tokenized_datasets = tokenized_datasets.map(
group_texts,
batched=True,
num_proc=data_args.preprocessing_num_workers,
load_from_cache_file=not data_args.overwrite_cache,
desc=f"Grouping texts in chunks of {max_seq_length}",
)
if training_args.do_train:
if "train" not in tokenized_datasets:
raise ValueError("--do_train requires a train dataset")
train_dataset = tokenized_datasets["train"]
if data_args.max_train_samples is not None:
train_dataset = train_dataset.select(range(data_args.max_train_samples))
if training_args.do_eval:
if "validation" not in tokenized_datasets:
raise ValueError("--do_eval requires a validation dataset")
eval_dataset = tokenized_datasets["validation"]
if data_args.max_eval_samples is not None:
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
# Data collator
# This one will take care of randomly masking the tokens.
pad_to_multiple_of_8 = data_args.line_by_line and training_args.fp16 and not data_args.pad_to_max_length
if data_args.whole_word_mask:
data_collator = DataCollatorForWholeWordMask(
tokenizer=tokenizer,
mlm_probability=data_args.mlm_probability,
pad_to_multiple_of=8 if pad_to_multiple_of_8 else None,
)
logger.info("*** train with whole word ***")
else:
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer,
mlm_probability=data_args.mlm_probability,
pad_to_multiple_of=8 if pad_to_multiple_of_8 else None,
)
logger.info("*** train with sub-word masking ***")
# Initialize our Trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset if training_args.do_train else None,
eval_dataset=eval_dataset if training_args.do_eval else None,
tokenizer=tokenizer,
data_collator=data_collator,
)
# Training
if training_args.do_train:
checkpoint = None
if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() # Saves the tokenizer too for easy upload
metrics = train_result.metrics
max_train_samples = (
data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
)
metrics["train_samples"] = min(max_train_samples, len(train_dataset))
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()
# Evaluation
if training_args.do_eval:
logger.info("*** Evaluate ***")
metrics = trainer.evaluate()
max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
try:
perplexity = math.exp(metrics["eval_loss"])
except OverflowError:
perplexity = float("inf")
metrics["perplexity"] = perplexity
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)
kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "fill-mask"}
if data_args.dataset_name is not None:
kwargs["dataset_tags"] = data_args.dataset_name
if data_args.dataset_config_name is not None:
kwargs["dataset_args"] = data_args.dataset_config_name
kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
else:
kwargs["dataset"] = data_args.dataset_name
if training_args.push_to_hub:
trainer.push_to_hub(**kwargs)
else:
trainer.create_model_card(**kwargs)
def _mp_fn(index):
# For xla_spawn (TPUs)
main()
if __name__ == "__main__":
main()
# NLP in diagnostic texts from nephropathology
This python project was created as part of the article "Natural Language Processing in diagnostic texts from
nephropathology".
The paper can be found [here](LINK).
The scripts ```database_preparation/data_preparation_pipeline.py```, ```TextClustering/clustering_pipeline.py```
and ```TextClassification/classification_pipeline.py``` give an idea of how this project can be used with other datasets.
The scripts ```TextClustering/basedOn_BOW/kmeans_Diagnosis.py```,
```TextClustering/basedOn_BOW/HDBSCAN_Diagnosis.py``` and ```TextClassification/bow_classification.py```
can also process tf-idf vectorized corpora.
All other scripts can only process corpora that are not vectorized.
Feel free to use and adapt the scripts to your own needs.
## Requirements
For preprocessing, the project requires some nltk corpora:
```
import nltk
nltk.download('stopwords')
nltk.download('punkt')
```
import numpy as np
import tensorflow_datasets as tfds
import tensorflow as tf
tfds.disable_progress_bar()
from matplotlib import pyplot as plt
import pandas as pd
import os
import sys
from TextClassification.argsparse_classification_preamble import argsparse_preamble
import classification_metrics as cls_metrics
import time
from database_preparation.utils_labeled_datasets import get_splits_for_cross_val
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
#from keras.layers.embeddings import Embedding
from tensorflow.keras.layers import Embedding
from keras_preprocessing.sequence import pad_sequences
from database_preparation.utils_labeled_datasets import text_label_files_to_labeled_dataset
from database_preparation.preprocess import print_meta_data
sys.path.append(os.getcwd())
def plot_graphs(history, metric):
plt.close()
plt.plot(history.history[metric])
plt.plot(history.history['val_'+metric])
plt.xlabel("Epochs")
plt.ylabel(metric)
plt.legend([metric, 'val_'+metric])
plt.show()
def save_graphs(history, metric, save_path):
plt.close()
plt.plot(history.history[metric])
plt.plot(history.history['val_' + metric])
plt.xlabel("Epochs")
plt.ylabel(metric)
plt.legend([metric, 'val_' + metric])
#plt.show()
plt.savefig(save_path, dpi=300)
print("generated "+save_path)
def dict2tf_dataset(data_dict):
    return tf.data.Dataset.from_tensor_slices((list(data_dict['text']), data_dict['label']))
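# dict2tf_dataset expects a mapping like {'text': [...], 'label': [...]} (as produced by the
# cross-validation splits below) and yields (text, label) pairs, e.g. (illustrative only):
#   ds = dict2tf_dataset({'text': ['Befund A', 'Befund B'], 'label': [0, 1]})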
def train_test_updatemetrics(train_dataset, test_dataset, num_classes, metrics,
num_epochs=50, plot_loss=False,
plot_save_path="TextClassification/plots/CNN/CNN_loss.png"):
train_dataset = dict2tf_dataset(train_dataset)
test_dataset = dict2tf_dataset(test_dataset)
############### Create the text encoder ###################
VOCAB_SIZE = 5000
encoder = tf.keras.layers.TextVectorization(
max_tokens=VOCAB_SIZE)
encoder.adapt(train_dataset.map(lambda text, label: text))
vocab = np.array(encoder.get_vocabulary())
# encode data to word-indices:
X_train = []
y_train = []
X_test = []
y_test = []
for text, label in train_dataset:
X_train.append(encoder(text).numpy())
y_train.append(label)
for text, label in test_dataset:
X_test.append(encoder(text).numpy())
y_test.append(label)
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)
X_test = np.asarray(X_test)
y_test = np.asarray(y_test)
##### create the model: #####
    # Padding the data samples to a maximum length (in tokens)
max_words = 450
X_train = pad_sequences(X_train, maxlen=max_words)
X_test = pad_sequences(X_test, maxlen=max_words)
# Building the CNN Model
embedding_dim = 100
filter_amount = 32
filter_size = 3
    model = Sequential()  # initializing the sequential CNN model
model.add(Embedding(len(encoder.get_vocabulary()), embedding_dim, input_length=max_words, mask_zero=True))
model.add(Conv1D(filter_amount, filter_size, padding='same', activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy',
optimizer=tf.keras.optimizers.Adam(1e-4),
metrics=["accuracy"])
# model.summary()
start = time.time()
# evaluate:
if plot_loss:
history = model.fit(X_train, y_train, epochs=num_epochs,
batch_size=128, verbose=2,
validation_data=(X_test, y_test),
validation_steps=1,
)
save_graphs(history, 'loss', plot_save_path)
else:
model.fit(X_train, y_train, epochs=num_epochs,
batch_size=128, verbose=2)
predictions = model.predict(X_test)
y_pred = np.argmax(predictions, axis=-1)
metrics.update_metrics(y_test, y_pred, True, start)
def main():
############# get labeled text data ###################
args = argsparse_preamble()
print("CNN Evaluation with corpus " + args.path2corpus + " and cluster set " + args.clustered_data)
print("infos about corpus:")
print_meta_data(args.path2corpus)
dataset = text_label_files_to_labeled_dataset(args.clustered_data, path2corpus=args.path2corpus)
num_classes = int(pd.DataFrame(dataset["label"]).nunique())
metrics = cls_metrics.ClassificationMetrics("CNN")
epochs = 100
folds = 10
for i, (train_dataset, test_dataset) in enumerate(get_splits_for_cross_val(dataset, folds)):
if args.loss_curve_check:
train_test_updatemetrics(train_dataset, test_dataset, num_classes, metrics,
epochs, plot_loss=True,
plot_save_path="TextClassification/plots/CNN/CNN_loss_"+str(i+1)+".png")
else:
print("====== CNN train/test run " + str(i+1) + "/" + str(folds) + " ======")
print(str(len(train_dataset["label"]))+" train documents")
print(str(len(test_dataset["label"])) + " test documents")
train_test_updatemetrics(train_dataset, test_dataset, num_classes, metrics, epochs)
if not args.loss_curve_check:
metrics.save_scores_to_disk(args.clustered_data)
metrics.pickle_object(args.clustered_data)
cls_metrics.print_results_as_latextable(metrics.json_file_path)
if __name__ == '__main__':
main()
import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt
import pandas as pd
import os
import sys
sys.path.append(os.getcwd())
from TextClassification.argsparse_classification_preamble import argsparse_preamble
import TextClassification.classification_metrics as cls_metrics
import time
from database_preparation.utils_labeled_datasets import get_splits_for_cross_val
from database_preparation.utils_labeled_datasets import text_label_files_to_labeled_dataset
from database_preparation.preprocess import print_meta_data
def save_graphs(history, metric, save_path):
plt.close()
plt.plot(history.history[metric])
plt.plot(history.history['val_' + metric])
plt.xlabel("Epochs")
plt.ylabel(metric)
plt.legend([metric, 'val_' + metric])
#plt.show()
plt.savefig(save_path, dpi=300)
print("generated "+save_path)
def dict2tf_dataset(data_dict):
    return tf.data.Dataset.from_tensor_slices(([text.lower() for text in data_dict['text']], data_dict['label']))
def train_test_updatemetrics(train_dataset, test_dataset, num_classes, metrics,
epochs=10, plot_loss=False,
plot_save_path="TextClassification/plots/RNN/RNN_loss.png"):
y_test = np.asarray(test_dataset['label'])
train_dataset = dict2tf_dataset(train_dataset)
test_dataset = dict2tf_dataset(test_dataset)
# Next shuffle the data for training and create batches of these (text, label) pairs:
BUFFER_SIZE = 10000
BATCH_SIZE = 64
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
for example, label in train_dataset.take(1):
'''print('text: ', example.numpy())
print('label: ', label.numpy())'''
pass
############### Create the text encoder ###################
VOCAB_SIZE = 5000
encoder = tf.keras.layers.TextVectorization(
max_tokens=VOCAB_SIZE)
encoder.adapt(train_dataset.map(lambda text, label: text))
vocab = np.array(encoder.get_vocabulary())
'''print("vocab info:")
print(vocab[:20])
print(len(encoder.get_vocabulary()))
encoded_example = encoder(example)[:3].numpy()
print(encoded_example)
for n in range(3):
print("Original: ", example[n].numpy())
print("Round-trip: ", " ".join(vocab[encoded_example[n]]))
print()'''
##### create the model: #####
embedding_dim = 64
model = tf.keras.Sequential([
encoder,
tf.keras.layers.Embedding(
input_dim=len(encoder.get_vocabulary()),
output_dim=embedding_dim,
# Use masking to handle the variable sequence lengths
mask_zero=True),
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
tf.keras.layers.Dense(embedding_dim, activation='relu'),
tf.keras.layers.Dense(num_classes, activation='softmax')
])
# model.summary()
# stacking 2 LSTM layers (seems to be much worse):
'''model = tf.keras.Sequential([
encoder,
tf.keras.layers.Embedding(len(encoder.get_vocabulary()), 64, mask_zero=True),
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
tf.keras.layers.Dense(64, activation='relu'),
tf.keras.layers.Dropout(0.5),
tf.keras.layers.Dense(num_classes, activation='softmax')
])'''
# All the layers after the Embedding support masking:
# print([layer.supports_masking for layer in model.layers])
# predict on a sample text without padding.
'''print("test prediction:")
sample_text = ('The movie was cool. The animation and the graphics '
'were out of this world. I would recommend this movie.')
predictions = model.predict(np.array([sample_text]))
print(predictions)'''
model.compile(loss='sparse_categorical_crossentropy',
optimizer=tf.keras.optimizers.Adam(1e-4),
metrics=["accuracy"])
start = time.time()
# evaluate:
if plot_loss:
history = model.fit(train_dataset, epochs=epochs,
validation_data=test_dataset,
validation_steps=2)
test_loss, test_acc = model.evaluate(test_dataset)
print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)
#save_graphs(history, 'accuracy')
save_graphs(history, 'loss', plot_save_path)
else:
model.fit(train_dataset, epochs=epochs)
predictions = model.predict(test_dataset)
y_pred = np.argmax(predictions, axis=-1)
metrics.update_metrics(y_test, y_pred, True, start)
def main():
############# get labeled text data ###################
# how to convert words 2 ids with gensim:
# words = corpora.Dictionary(diag_lst)
args = argsparse_preamble()
print("RNN Evaluation with corpus " + args.path2corpus + " and cluster set " + args.clustered_data)
print("infos about corpus:")
print_meta_data(args.path2corpus.replace('.pkl', '_meta.pkl'))
# dataset = load_labeled_dataset(args.clustered_data)
dataset = text_label_files_to_labeled_dataset(args.clustered_data, path2corpus=args.path2corpus)
num_classes = int(pd.DataFrame(dataset["label"]).nunique())
metrics = cls_metrics.ClassificationMetrics("RNN")
folds = 10
epochs = 70
for i, (train_dataset, test_dataset) in enumerate(get_splits_for_cross_val(dataset, folds)):
if args.loss_curve_check:
train_test_updatemetrics(train_dataset, test_dataset, num_classes,
metrics, epochs=epochs, plot_loss=True,
plot_save_path="TextClassification/plots/RNN/RNN_loss_"+str(i+1)+".png")
else:
print("====== RNN train/test run " + str(i + 1) + "/" + str(folds) + " ======")
print(str(len(train_dataset["label"])) + " train documents")
print(str(len(test_dataset["label"])) + " test documents")
train_test_updatemetrics(train_dataset, test_dataset, num_classes, metrics, epochs=epochs)
if not args.loss_curve_check:
metrics.save_scores_to_disk(args.clustered_data)
metrics.pickle_object(args.clustered_data)
cls_metrics.print_results_as_latextable(metrics.json_file_path)
if __name__ == '__main__':
main()
import argparse
import sys, os
def argsparse_preamble():
sys.path.append(os.getcwd())
parser = argparse.ArgumentParser()
parser.add_argument("--overwrite", action='store_true')#False: generate data only if it doesn already exist
parser.add_argument("--show_figures", action='store_true')
parser.add_argument("--clustered_data", default="HDBSCAN")
parser.add_argument("--path2corpus", default="database/bow_prepro_desc.pkl")
parser.add_argument("--loss_curve_check", action='store_true')
args = parser.parse_args()
return args
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import datasets
import torch
import pandas as pd
import numpy as np
from transformers import Trainer
from transformers import TrainingArguments
import os
import sys
import pyarrow as pa
sys.path.append(os.getcwd())
from TextClassification.argsparse_classification_preamble import argsparse_preamble
import TextClassification.classification_metrics as cls_metrics
from database_preparation.utils_labeled_datasets import get_splits_for_cross_val
from database_preparation.utils_labeled_datasets import text_label_files_to_labeled_dataset
from database_preparation.preprocess import print_meta_data
from database_preparation.utils_labeled_datasets import is_text_lst_tokenized, is_text_lst_tfidf_vectorized
args = argsparse_preamble()
models_save_path = "./TextClassification/models/bert_models_new"
if not os.path.isdir(models_save_path):
os.makedirs(models_save_path)
########## functions ##########
def train(train_set, test_set, classifier_save_path, base_bert_model,
overwrite=False, track_loss_curves=True, epochs=3,
learning_rate=5e-5, save_model=True, cuda_batch_size=8):
'''
trains and saves the model + train/test-data at classifier_save_path
'''
if save_model:
if os.path.isdir(classifier_save_path):
if overwrite:
print(classifier_save_path + " already exists! (overwriting old model!)")
else:
print(classifier_save_path + " already exists! (skipping training)")
return
# This will issue a warning about some of the pretrained weights not being used and some weights being randomly initialized.
# That’s because we are throwing away the pretraining head of the BERT model to replace it with a classification head which is randomly initialized.
# We will fine-tune this model on our task, transferring the knowledge of the pretrained model to it (which is why doing this is called transfer learning).
    if test_set is None:
num_labels = len(np.unique(train_set["label"]))
else:
num_labels = len(np.unique(train_set["label"] + test_set["label"]))
model = AutoModelForSequenceClassification.from_pretrained(base_bert_model, num_labels=num_labels)
if torch.cuda.is_available():
batch_size = cuda_batch_size
else:
batch_size = 8
if track_loss_curves:
training_args = TrainingArguments(classifier_save_path + "/trainer",
overwrite_output_dir=True,
save_steps=2000,
do_train=True,
do_eval=True,
num_train_epochs=epochs,
evaluation_strategy='steps',
logging_steps=2000,
per_device_train_batch_size=batch_size,
learning_rate=learning_rate
)
else:
training_args = TrainingArguments(classifier_save_path + "/trainer",
overwrite_output_dir=True,
save_steps=2000,
num_train_epochs=epochs,
logging_steps=2000,
per_device_train_batch_size=batch_size,
learning_rate=learning_rate
)
print("training args: " + str(training_args.to_dict()))
print("device:" + str(training_args.device))
print("gpus: " + str(training_args.n_gpu))
trainer = Trainer(
model=model, args=training_args, train_dataset=train_set, eval_dataset=test_set
)
# training
train_result = trainer.train()
if track_loss_curves:
# compute train results
metrics = train_result.metrics
metrics["train_samples"] = len(train_set)
# save train results
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()
# compute evaluation results
metrics = trainer.evaluate()
metrics["eval_samples"] = len(test_set)
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)
# save model
    if save_model and test_set is not None:
model.save_pretrained(classifier_save_path)
hf_data_dict = datasets.DatasetDict({"train": train_set, "test": test_set})
hf_data_dict.save_to_disk(classifier_save_path + "/tokenized_train_test_dataset")
return model
def evaluate_saved_model(classifier_path, metrics_obj):
# load model:
model = AutoModelForSequenceClassification.from_pretrained(classifier_path, from_tf=False)
# load tokenized datasets:
train_test_set = datasets.DatasetDict.load_from_disk(classifier_path + "/tokenized_train_test_dataset")
# train_set = train_test_set["train"]
test_set = train_test_set["test"]
evaluate(model, test_set, metrics_obj)
def evaluate(model, test_set, metrics_obj):
# just use default parameters
training_args = TrainingArguments("TextClassification/models/temp_trainer", evaluation_strategy="epoch",
overwrite_output_dir=True, )
trainer = Trainer(
model=model,
args=training_args,
# train_dataset=train_set,
eval_dataset=test_set
)
# print(trainer.evaluate())
predictions = trainer.predict(test_set)
preds = np.argmax(predictions.predictions, axis=-1)
metrics_obj.update_metrics(predictions.label_ids, preds, True)
def main():
if torch.cuda.is_available():
torch.cuda.empty_cache()
print("running with cuda")
label_sets = [args.clustered_data]
if is_text_lst_tokenized(args.path2corpus):
print(f"Error: {args.path2corpus} is a tokenized corpus. Please pass a not tokenized corpus!")
exit(1)
base_bert_models = ["./LanguageModelling/ger-patho-bert-2", "bert-base-german-cased"]
# base_bert_models = ["bert-base-german-cased"]
evaluate_test_set = True
do_train = True
    test_run = False  # runs k-fold cross-validation with only one train/test run per model
folds = 10
track_loss_curves = False
epochs = 4
save_model = False
cuda_batch_size = 2
for label_set in label_sets:
# train_test_dataset = dt.load_labeled_dataset(label_set)
train_test_dataset = text_label_files_to_labeled_dataset(args.clustered_data, path2corpus=args.path2corpus)
        if train_test_dataset is None:
            print("can't do BERT training without data!")
            sys.exit()
        # pre-save the train/test splits for cross-validation,
        # so that each model is trained and tested on the same data.
k_train_test_sets = []
for (train_dataset, test_dataset) in get_splits_for_cross_val(train_test_dataset, folds):
k_train_test_sets.append(tuple((train_dataset, test_dataset)))
for base_bert_model in base_bert_models:
print(base_bert_model + " Evaluation with corpus " + args.path2corpus + " and cluster set " + label_set)
print("infos about corpus:")
print_meta_data(args.path2corpus)
# compose names, depending on label_set and base bert model:
if "./LanguageModelling/" in base_bert_model:
# is it a custom LM from our languagemodeling-folder?
name = base_bert_model.replace("./LanguageModelling/", "")
classifier_path = models_save_path + "/" + name + "_" + label_set + "_ClassificatonModel"
metrics = cls_metrics.ClassificationMetrics(name)
elif '/' in base_bert_model and not './' in base_bert_model:
name = base_bert_model.replace("/", "_")
classifier_path = models_save_path + "/" + name + "_" + label_set + "_ClassificatonModel"
metrics = cls_metrics.ClassificationMetrics(name)
elif "gottbert-base" in base_bert_model:
classifier_path = models_save_path + "/gottbert-base_" + label_set + "_ClassificatonModel"
metrics = cls_metrics.ClassificationMetrics("gottbert-base")
else: # germanbert
classifier_path = models_save_path + "/" + base_bert_model + "_" + label_set + "_ClassificatonModel"
metrics = cls_metrics.ClassificationMetrics(base_bert_model)
if save_model:
print("saving model at: ")
print(classifier_path)
# cross validation:
for i, (train_dataset, test_dataset) in enumerate(k_train_test_sets):
# convert to dataframe:
train_dataset_ds = datasets.Dataset(pa.Table.from_pandas(pd.DataFrame(train_dataset)))
test_dataset_ds = datasets.Dataset(pa.Table.from_pandas(pd.DataFrame(test_dataset)))
# tokenize
tokenizer = AutoTokenizer.from_pretrained(base_bert_model)
def tokenize_function(examples):
return tokenizer(examples["text"], padding="max_length", truncation=True)
train_set = train_dataset_ds.map(tokenize_function, batched=True)
test_set = test_dataset_ds.map(tokenize_function, batched=True)
# train
if do_train:
print("==> training " + classifier_path + "_" + str(i))
model = train(train_set, test_set, classifier_path + "_" + str(i), base_bert_model,
track_loss_curves=track_loss_curves, epochs=epochs,
save_model=save_model, cuda_batch_size=cuda_batch_size)
# evaluate
if evaluate_test_set:
if save_model:
print("==> predicting test set with " + classifier_path + "_" + str(i))
evaluate_saved_model(classifier_path + "_" + str(i), metrics)
else:
print("==> predicting test set with " + classifier_path + "_" + str(i))
evaluate(model, test_set, metrics)
if test_run:
break
metrics.save_scores_to_disk(label_set)
metrics.pickle_object(label_set)
if __name__ == '__main__':
main()
# -*- coding: iso-8859-1 -*-
import os
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier # stochastic gradient descent (SGD) learning
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from TextClassification.argsparse_classification_preamble import argsparse_preamble
import database_preparation.utils_labeled_datasets as dt
from database_preparation.utils_labeled_datasets import get_splits_for_cross_val
import TextClassification.classification_metrics as cls_metrics
from database_preparation.preprocess import print_meta_data
from database_preparation.utils_labeled_datasets import is_text_lst_tfidf_vectorized
import pickle
import numpy as np
import pandas as pd
'''from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline as make_pipeline_imb'''
# experiment:
merge_classes = [(0, 1), (5, 7), (9, 10, 11), (6, 15)]
# for tfidf vectorizer
def identity(words):
return words
def create_pipeline(estimator, reduction=False, with_vectorizer=True):
    '''
    Construct a pipeline with sklearn.pipeline.
    The passed estimator will be the last element of the pipeline,
    using tf-idf as the vectorizer (unless with_vectorizer=False).
    '''
steps = []
if with_vectorizer:
steps.append(
('vectorize', TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False))
)
if reduction:
steps.append((
'reduction', TruncatedSVD(n_components=1000)
))
# Add the estimator
steps.append(('classifier', estimator))
return Pipeline(steps)
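# Minimal usage sketch (illustrative, mirroring the cross-validation loop further below):
#   pipe = create_pipeline(SGDClassifier())
#   pipe.fit(train_tokens, train_labels)   # texts as token lists; identity() skips re-tokenization
#   y_pred = pipe.predict(test_tokens)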
def get_immediate_subdirectories(a_dir):
return [name for name in os.listdir(a_dir)
if os.path.isdir(os.path.join(a_dir, name))]
def cross_validate_with_bow_classifiers(label_set, fold_amount=10,
path2corpus="./database/bow_prepro_desc.pkl",
df_cases_file="database/df_cases.pkl"):
'''
cross validates passed label_set with text data saved in path2corpus and labels saved in df_cases_file.
path2corpus should be a list of reports, where each report is tokenized
or a list of tf-idf vectorized texts (of type scipy.sparse.csr.csr_matrix).
'''
if is_text_lst_tfidf_vectorized(path2corpus):
models = []
models.append(create_pipeline(MultinomialNB(), with_vectorizer=False))
models.append(create_pipeline(MLPClassifier(max_iter=300), with_vectorizer=False))
models.append(create_pipeline(LogisticRegression(), with_vectorizer=False))
models.append(create_pipeline(SGDClassifier(), with_vectorizer=False))
with open(path2corpus, 'rb') as f:
loaded_texts = pickle.load(f)
n = np.asarray(loaded_texts.todense().tolist())
from database_preparation.utils_labeled_datasets import text_label_2_labeled_dataset
df_cases = pd.read_pickle(df_cases_file)
dataset = text_label_2_labeled_dataset(n, df_cases["label_" + label_set])
else:
# create model-pipelines for cross-validation with different pipelines:
models = []
models.append(create_pipeline(SGDClassifier()))
models.append(create_pipeline(MultinomialNB()))
models.append(create_pipeline(LogisticRegression()))
models.append(create_pipeline(MLPClassifier(max_iter=300)))
'''models.append(make_pipeline_imb(TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)
, RandomOverSampler(), SGDClassifier()))'''
# print(f"train models {[model['classifier'] for model in models]} with corpus {path2corpus} and cluster set {label_set}")
print("infos about corpus:")
print_meta_data(path2corpus)
dataset = dt.text_label_files_to_labeled_dataset(label_set, path2corpus=path2corpus,
df_cases_path=df_cases_file)
# in order to use same 10-fold-cross-splits for each model:
k_train_test_sets = []
for (train_dataset, test_dataset) in get_splits_for_cross_val(dataset,
fold_amount, merge_classes=None, oversample=False, stratified=True):
k_train_test_sets.append(tuple((train_dataset, test_dataset)))
# cross validate each model and save metrics:
for model in models:
print('running ' + str(model['classifier']))
name = model.named_steps['classifier'].__class__.__name__
if 'reduction' in model.named_steps:
name += " (TruncatedSVD)"
metrics = cls_metrics.ClassificationMetrics(name)
for i, (train_dataset, test_dataset) in enumerate(k_train_test_sets):
model.fit(train_dataset['text'], train_dataset['label'])
y_pred = model.predict(test_dataset['text'])
metrics.update_metrics(test_dataset['label'], y_pred, False)
metrics.save_scores_to_disk(label_set)
metrics.pickle_object(label_set)
df = metrics.classes_scores(-1)
print(df.to_latex().replace('{}', 'cluster'))
cls_metrics.print_results_as_latextable(metrics.json_file_path)
def main():
args = argsparse_preamble()
cross_validate_with_bow_classifiers(args.clustered_data, path2corpus=args.path2corpus)
if __name__ == '__main__':
main()
# -*- coding: iso-8859-1 -*-
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
import sys
import database_preparation.utils_labeled_datasets as dt
# for training validation:
import TextClassification.classification_metrics as cls_metrics
from sklearn.decomposition import TruncatedSVD
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
import nltk
import datasets
import pyarrow as pa
import pickle
fold_amount = 10
#%%
# for tfidf vectorizer
def identity(words):
return words
def create_pipeline(estimator, reduction=False):
    '''
    Construct a pipeline with sklearn.pipeline.
    The passed estimator will be the last element of the pipeline,
    using tf-idf as the vectorizer.
    '''
steps = []
steps.append(
('vectorize', TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False))
)
if reduction:
steps.append((
'reduction', TruncatedSVD(n_components=1000)
))
# Add the estimator
steps.append(('classifier', estimator))
return Pipeline(steps)
def cross_validate_with_simple_SVM(label_set, path2corpus = "./database/bow_prepro_diag.pkl", path2dfcases='./database/df_cases.pkl'):
"""
    trains a simple SVM (SGDClassifier pipeline) with the given data and
    returns a ClassificationMetrics object with 10-fold cross-validation scores
"""
print(f"Calculating SVM-classification performance of {label_set} cluster-setr "
f"with text corpus {path2corpus}.")
metrics = cls_metrics.ClassificationMetrics(label_set)
#print("train SVM for cluster " + label_set + " with " + path2corpus + ".")
text_lst = pd.read_pickle(path2corpus)
text1 = np.asarray(text_lst[0])
corpus_is_tokenized = bool(text1.ndim)
del text1, text_lst
if corpus_is_tokenized:
dataset = dt.text_label_files_to_labeled_dataset(label_set,
path2corpus
, path2dfcases, False)
else:
dataset_raw = dt.text_label_files_to_labeled_dataset(label_set,
path2corpus
, path2dfcases, False)
# tokenize
tokenized_texts = []
for t_text in dataset_raw['text']:
tokenized_texts.append(nltk.tokenize.word_tokenize(t_text, language='german'))
df = pd.DataFrame({'text': tokenized_texts, 'label': dataset_raw['label']})
dataset = datasets.Dataset(pa.Table.from_pandas(df))
    # 10-fold cross-validation:
folds = KFold(n_splits=10, shuffle=False)
for i, (train_index, test_index) in enumerate(folds.split(list(range(len(dataset))))):
train_dataset = dataset[train_index]
test_dataset = dataset[test_index]
pipe = create_pipeline(SGDClassifier())
pipe.fit(train_dataset['text'], train_dataset['label'])
y_pred = pipe.predict(test_dataset['text'])
metrics.update_metrics(test_dataset['label'], y_pred, False)
# train_save_SVM_for_clusterset_evaluation(label_set)
# metrics.save_scores_to_disk("diagnose_texts_with_SGD")
return metrics
def cross_validate_label_corpus_with_simple_SVM(labels, path2corpus = "./database/bow_prepro_diag.pkl", sample = True):
"""
trains a simple SVM with the given data
returns 10-fold-cross-validated accuracy value
"""
texts = pd.read_pickle(path2corpus)
from database_preparation.utils_labeled_datasets import text_label_2_labeled_dataset
metrics = cls_metrics.ClassificationMetrics("temp")
#print("train SVM for cluster " + label_set + " with " + path2corpus + ".")
text_lst = pd.read_pickle(path2corpus)
text1 = np.asarray(text_lst[0])
corpus_is_tokenized = bool(text1.ndim)
del text1, text_lst
if corpus_is_tokenized:
dataset = text_label_2_labeled_dataset(texts,labels)
else:
dataset_raw = text_label_2_labeled_dataset(texts,labels)
# tokenize
tokenized_texts = []
for t_text in dataset_raw['text']:
tokenized_texts.append(nltk.tokenize.word_tokenize(t_text, language='german'))
df = pd.DataFrame({'text': tokenized_texts, 'label': dataset_raw['label']})
dataset = datasets.Dataset(pa.Table.from_pandas(df))
    # 10-fold cross-validation:
folds = KFold(n_splits=10, shuffle=False)
for i, (train_index, test_index) in enumerate(folds.split(list(range(len(dataset))))):
train_dataset = dataset[train_index]
test_dataset = dataset[test_index]
pipe = create_pipeline(SGDClassifier())
pipe.fit(train_dataset['text'], train_dataset['label'])
y_pred = pipe.predict(test_dataset['text'])
metrics.update_metrics(test_dataset['label'], y_pred, False)
if sample:
return metrics.scores['accuracy']
# train_save_SVM_for_clusterset_evaluation(label_set)
# metrics.save_scores_to_disk("diagnose_texts_with_SGD")
return np.mean(metrics.scores['accuracy'])
def train_SVM_with_clusterset(label_set, path2corpus = "./database/bow_prepro_diag.pkl", path2dfcases='./database/df_cases.pkl'):
"""
trains ans saves a svm, trained with the whole data under as:
"./ModelTestingAndExplaining/models/SVM_trained_with_" + label_set + "_clustered.pkl"
"""
print("train SVM for cluster " + label_set + " with " + path2corpus + ".")
text_lst = pd.read_pickle(path2corpus)
text1 = np.asarray(text_lst[0])
corpus_is_tokenized = bool(text1.ndim)
del text1, text_lst
if corpus_is_tokenized:
dataset = dt.text_label_files_to_labeled_dataset(label_set,
path2corpus
, path2dfcases, False)
else:
dataset_raw = dt.text_label_files_to_labeled_dataset(label_set,
path2corpus
, path2dfcases, False)
# tokenize
tokenized_texts = []
for t_text in dataset_raw['text']:
tokenized_texts.append(nltk.tokenize.word_tokenize(t_text, language='german'))
df = pd.DataFrame({'text': tokenized_texts, 'label': dataset_raw['label']})
dataset = datasets.Dataset(pa.Table.from_pandas(df))
pipe = create_pipeline(SVC(probability=True, kernel='linear'))
'''svd = TruncatedSVD(n_components=100, n_iter=7, random_state=42)
pipe = make_pipeline(make_pipeline(
TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False),svd),
SVC(C=150, gamma=2e-2, probability=True))'''
pipe.fit(dataset['text'], dataset['label'])
path = "./ModelTestingAndExplaining/models/SVM_trained_with_" + label_set + "_clustered.pkl"
pickle.dump(pipe, open(path, 'wb'))
def update_cls_metric(label_set, cls_accuracy):
file_name = label_set + "_Diagnosis"
file_name = file_name.replace('KMeans', 'kmeans')
file_name = file_name.replace('d2v', 'doc2vec')
file_path = "TextClustering/cluster_metrics/" + file_name + ".pkl"
try:
scores = pd.DataFrame(pd.read_pickle(file_path))
except Exception:
return
if 'cls accuracy' in scores.index:
scores.loc['cls accuracy', file_name] = cls_accuracy
new_scores = scores
else:
vals = list(scores[file_name])
new_index = scores.index.append(pd.Index(['cls accuracy']))
vals.append(cls_accuracy)
new_scores = pd.DataFrame({file_name: vals}, index=new_index)
new_scores.to_pickle(file_path)
def update_cls_metric_for_each_clusterset():
'''
does 10-fold-cross-validation with a svm for each cluster-set saved in './database/df_cases.pkl'
using always the text in 'database/diag_lst_tokenized.pkl'
'''
label_sets = dt.get_all_label_set_ids()
# label_sets = ["German_BERT"]
for label_set in label_sets:
accuracy = np.mean(cross_validate_with_simple_SVM(label_set,
'database/diag_lst_tokenized.pkl',
'./database/df_cases.pkl').scores['accuracy'])
print("svm-cls-accuracy of cluster set "+label_set+": "+str(accuracy))
update_cls_metric(label_set, accuracy)
def main():
#update_cls_metric_for_each_clusterset()
cluster_set_name = "German_BERT"
#text_data = 'database/darmischaemie_prostata_txt_lst.pkl' cluster_set_dict = './database/df_cases2.pkl'
text_data = 'database/diag_lst.pkl'
#text_data = 'database/diag_lst_tokenized.pkl'
cluster_set_dict = './database/df_cases.pkl'
train_SVM_with_clusterset(cluster_set_name, text_data, cluster_set_dict)
if __name__ == '__main__':
main()
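# --- Hedged usage sketch (illustration only; toy texts, labels and parameters below are made up) ---
# Mirrors the idea of the cross-validation functions above: TF-IDF on pre-tokenized texts,
# dimensionality reduction, SGD classifier, k-fold evaluation.
def _toy_cross_validation_sketch():
    from sklearn.pipeline import Pipeline
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.decomposition import TruncatedSVD
    from sklearn.linear_model import SGDClassifier
    from sklearn.model_selection import KFold
    from sklearn.metrics import accuracy_score
    import numpy as np

    def identity(word):
        return word

    texts = [["niere", "biopsie", "glomerulus"], ["glomerulus", "sklerose"],
             ["tubulus", "atrophie"], ["interstitium", "fibrose", "tubulus"],
             ["niere", "glomerulus"], ["tubulus", "interstitium"]]
    labels = [0, 0, 1, 1, 0, 1]

    accuracies = []
    for train_idx, test_idx in KFold(n_splits=3, shuffle=False).split(texts):
        pipe = Pipeline([
            ('vectorizer', TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)),
            ('reduction', TruncatedSVD(n_components=2)),  # 2 instead of 1000 for the tiny toy vocabulary
            ('classifier', SGDClassifier()),
        ])
        pipe.fit([texts[i] for i in train_idx], [labels[i] for i in train_idx])
        y_pred = pipe.predict([texts[i] for i in test_idx])
        accuracies.append(accuracy_score([labels[i] for i in test_idx], y_pred))
    return np.mean(accuracies)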
import time
import json
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, cohen_kappa_score
import numpy as np
import os
import sys
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
# from sklearn.metrics import plot_confusion_matrix  # unused here; removed in newer scikit-learn versions
import seaborn as sn
import pandas as pd
import pickle
sys.path.append(os.getcwd())
class ClassificationMetrics(object):
def __init__(self, model_name, metrics_save_name="metrics_new", **kwargs):
self.scores = {
'name': model_name,
'fold_amount': 0,
'accuracy': [],
'precision': [],
'recall': [],
'f1': [],
'cohen_kappa': [],
'time': [],
}
self.y_preds = []
self.y_tests = []
# create classification-metrics folder if not exist:
if not os.path.isdir("./TextClassification/cls_metrics"):
os.makedirs("./TextClassification/cls_metrics")
# create subfolder for our metrics if not exist:
self.metrics_path = "./TextClassification/cls_metrics/"+metrics_save_name+"/"
if not os.path.isdir(self.metrics_path):
os.makedirs(self.metrics_path)
# save paths:
self.json_file_path = "none"
self.object_dir = "none"
def update_metrics(self, y_test, y_pred, print_cls_report=False, start_time=None):
'''
call this for each test run if you do k-fold-cross-validation
'''
if print_cls_report:
print(classification_report(y_test, y_pred))
self.y_preds.append(y_pred)
self.y_tests.append(y_test)
self.scores['fold_amount'] += 1
if start_time != None:
self.scores['time'].append(time.time() - start_time)
else:
self.scores['time'].append(-1)
self.scores['accuracy'].append(accuracy_score(y_test, y_pred))
# the ability of the classifier not to label as positive a sample that is negative - tp / (tp + fp)
# -> precision = 1 -> This class was detected perfectly. There are only TPs! (true positives)
# -> precision = 0.75 -> there are some false positives: sometimes the classifier predicted class A, but it wasn't
self.scores['precision'].append(precision_score(y_test, y_pred, average='weighted'))
# the ability of the classifier to find all the positive samples - tp / (tp + fn)
self.scores['recall'].append(recall_score(y_test, y_pred, average='weighted'))
self.scores['f1'].append(f1_score(y_test, y_pred, average='weighted'))
# Cohen's kappa is the two-rater counterpart of Fleiss' kappa (they differ slightly in how chance agreement is estimated).
# The kappa score measures the degree of agreement between
# the two raters, also known as inter-rater reliability.
self.scores['cohen_kappa'].append(cohen_kappa_score(y_test, y_pred))
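# Illustrative (hedged) toy example: for y_test = [0, 0, 1, 1] and y_pred = [0, 1, 1, 1],
# class-0 precision = 1/1 and class-1 precision = 2/3; weighted by the true supports (2, 2)
# this gives a weighted precision of ~0.83, a weighted recall of 0.75 and a Cohen's kappa
# of 0.5 (observed agreement 0.75, chance agreement 0.5).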
def clean_class_score_table(self, df):
df.drop(['accuracy', 'macro avg', 'weighted avg'], axis=1, inplace=True)
df.drop(['precision', 'recall'], axis=0, inplace=True)
df = df.T
# round f1-values
for i, x in enumerate(df['f1-score']):
df['f1-score'][i] = round(x, 3)
# edit support entries:
integer_support = [str(x)[:-2] for x in df['support']]
df['support'] = integer_support
df.sort_values(by=['f1-score'], inplace=True, ascending=False)
return df
def get_merged_predictions(self):
merged_y_tests = []
merged_y_preds = []
for i in range(0, len(self.y_tests)):
for x in self.y_preds[i]:
merged_y_preds.append(x)
for y in self.y_tests[i]:
merged_y_tests.append(y)
return merged_y_tests, merged_y_preds
def classes_scores(self, prediction_set=0):
'''
returns per-class scores (f1-score and support) as a DataFrame
'''
if prediction_set < 0:
merged_y_tests, merged_y_preds = self.get_merged_predictions()
dic = classification_report(merged_y_tests, merged_y_preds,
output_dict=True)
df = pd.DataFrame(dic)
return self.clean_class_score_table(df)
else:
dic = classification_report(self.y_tests[prediction_set], self.y_preds[prediction_set],
output_dict=True)
df = pd.DataFrame(dic)
return self.clean_class_score_table(df)
def plot_confusion_matrix(self, labels, prediction_set=0, plot=False, save=True,
filename='confusion_matrix', title=None,
normalized=True, annot = False, colormap='gray'):
if title == None:
title = filename
if prediction_set < 0:
y_test, y_pred = self.get_merged_predictions()
else:
y_test = self.y_tests[prediction_set]
y_pred = self.y_preds[prediction_set]
try:
conf_matrix = np.asarray(confusion_matrix(y_test , y_pred,labels=labels),dtype=float)
if normalized:
for y, row in enumerate(conf_matrix[:]):
sum_apperiance = np.sum(row)
for x, pred_amount in enumerate(row):
if sum_apperiance == 0:
row[x] = 0
else:
row[x] = round(pred_amount / sum_apperiance,2)
conf_matrix[y] = row
except ValueError:
if labels[0] == 0:
labels2=['class'+str(a) for a in range(len(labels))]
try:
conf_matrix = confusion_matrix(y_test, y_pred, labels=labels2)
except ValueError:
print("confusion_matrix generation failed.")
print("labels:")
print(labels)
print("y_test:")
print(self.y_tests[prediction_set])
print("y_pred:")
print(self.y_preds[prediction_set])
return
else:
print("confusion_matrix generation failed.")
print("labels:")
print(labels)
print("y_test:")
print(self.y_tests[prediction_set])
print("y_pred:")
print(self.y_preds[prediction_set])
return
#print(conf_matrix)
df_cm = pd.DataFrame(conf_matrix, labels, labels)
sn.set(font_scale=1.4) # for label size
if plot or save:
plt.close()
if normalized:
hm = sn.heatmap(df_cm, annot=annot, vmin=0, vmax=1, cmap=colormap)  # note: this call occasionally causes problems
else:
hm = sn.heatmap(df_cm, annot=annot, annot_kws={"size": 10}, cmap=colormap)
plt.xlabel("predicted", fontsize=14)
plt.ylabel("true", fontsize=14)
plt.title(title, fontsize=16)
if plot:
plt.show()
if save:
figure = hm.get_figure()
save_path = "TextClassification/plots/"+filename+".png"
try:
figure.savefig(save_path, dpi=300)
print("generated "+save_path)
except FileNotFoundError:
os.mkdir("TextClassification/plots")
figure.savefig(save_path, dpi=300)
print("generated " + save_path)
def save_scores_to_disk(self, labelset):
'''
if the file already exists, the new scores are appended as a new row
'''
# save scores as table, appending if modelname already exist:
self.json_file_path = self.metrics_path+labelset+"_clustered_all_classifiers.json"
if os.path.isfile(self.json_file_path):
# add number to name, if model appears already in json file:
with open(self.json_file_path, 'r') as f:
amount_same_name = 0
for line in f:
scores = json.loads(line)
if self.scores['name'] in scores["name"]:
amount_same_name += 1
if amount_same_name > 0:
self.scores['name']=self.scores['name']+"_"+str(amount_same_name+1)
with open(self.json_file_path, 'a') as f:
f.write(json.dumps(self.scores) + "\n")
def pickle_object(self, labelset, model_name='default'):
# pickles whole object
if model_name == 'default':
model_name = self.scores['name']
self.object_dir = self.metrics_path + labelset + "_clustered_" + model_name + "_classified.pickle"
with open(self.object_dir, 'wb') as f:
pickle.dump(self, f)
def print_results_as_latextable(jsonfile, print_only_f1_kappa=True):
'''
Returns the results as a LaTeX table.
Expects jsonfile (path to a json file) as saved by the metrics object.
You can obtain the json file of a metrics object via metrics.json_file_path.
'''
if print_only_f1_kappa:
print("================== " + jsonfile + " ==================")
fields = [key for key in ClassificationMetrics(None).scores.keys()]
to_remove = ["fold_amount","accuracy","precision","recall"]
for remove in to_remove:
fields.remove(remove)
table = []
with open(jsonfile, 'r') as f:
for idx, line in enumerate(f):
scores = json.loads(line)
row = [scores['name']]
for field in fields[1:]:
row.append("{:0.3f}".format(np.mean(scores[field])))
table.append(row)
# sort over f1 score:
table.sort(key=lambda r: r[1], reverse=True)
# print(tabulate.tabulate(table, headers=fields))
else:
print("================== " + jsonfile + " ==================")
fields = [key for key in ClassificationMetrics(None).scores.keys()]
table = []
with open(jsonfile, 'r') as f:
for idx, line in enumerate(f):
scores = json.loads(line)
row = [scores['name'], scores['fold_amount']]
for field in fields[2:]:
row.append("{:0.3f}".format(np.mean(scores[field])))
table.append(row)
# sort over f1 score:
table.sort(key=lambda r: r[5], reverse=True)
# print(tabulate.tabulate(table, headers=fields))
# export it to a df and then to a latex table:
df = pd.DataFrame(columns=fields)
for i, field in enumerate(fields):
# df.append()
df[field] = [e[i] for e in table]
df.drop(columns=['time'], axis=1, inplace=True)
as_latex = df.to_latex(index=False)
print(as_latex)
return as_latex
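# Hedged usage sketch (illustrative; the classifier and label-set names below are made up):
#   metrics = ClassificationMetrics("SGDClassifier")
#   ... call metrics.update_metrics(y_test, y_pred) once per fold ...
#   metrics.save_scores_to_disk("HDBSCAN")
#   print_results_as_latextable(metrics.json_file_path)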
def main():
y_true = [0,1,0,1,0,2]
y_pred = [1,1,0,1,0,2]
metrics = ClassificationMetrics("metrics_test")
metrics.update_metrics(y_true,y_pred)
metrics.save_scores_to_disk("testitest")
metrics.pickle_object("testitest")
metrics.plot_confusion_matrix([i for i in range(3)],0,True,True)
if __name__ == "__main__":
main()
import os
####### pipeline parameters ################
#cluster_sets = ['KMeans', 'LDA', 'HDBSCAN', 'GSDPMM', 'German_BERT', 'Patho_BERT', 'top2vec']
cluster_sets = ['HDBSCAN']
# params:
path2corpus_bow_preprocessed = 'database/bow_prepro_desc.pkl'
path2corpus_embedding_preprocessed = 'database/embedding_prepro_desc.pkl'
#check working directory:
workdir = os.getcwd()
if not workdir[-len('nlp-in-diagnostic-texts-from-nephropathology'):] == 'nlp-in-diagnostic-texts-from-nephropathology':
print(workdir + " is the wrong working directory.")
print("please make shure to run this script with working directory '.../path/to/nlp-in-diagnostic-texts-from-nephropathology'.")
exit(1)
for cluster_set in cluster_sets:
script_queue = [
f"python TextClassification/bow_classification.py --clustered_data {cluster_set} --path2corpus {path2corpus_bow_preprocessed}",
f"python TextClassification/RNN_classification.py --clustered_data {cluster_set} --path2corpus {path2corpus_embedding_preprocessed}",
f"python TextClassification/CNN_classification.py --clustered_data {cluster_set} --path2corpus {path2corpus_embedding_preprocessed}",
#f"python TextClassification/bert_classification.py --clustered_data {cluster_set} --path2corpus {path2corpus_embedding_preprocessed}",
f"python TextClassification/print_classification_metrics.py --clustered_data {cluster_set}"
]
for script in script_queue:
print("\n########################################### executing ###########################################")
print(script)
print("####################################################################################################\n")
os.system(script)
\ No newline at end of file
import matplotlib.pyplot as plt
import math
import json
import argparse
def plot_loss_curve(path2json, title='loss'):
with open(path2json) as f:
log_history = json.load(f)["log_history"]
# Keep track of train and evaluate loss.
loss_history = {'train_loss': [], 'eval_loss': [],
'train_steps': [], 'train_epochs': [],
'eval_steps': [], 'eval_epochs': []}
# Keep track of train and evaluate perplexity.
# This is a metric useful to track for language models.
perplexity_history = {'train_perplexity': [], 'eval_perplexity': []}
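# Note: perplexity = exp(cross-entropy loss); e.g. a loss of 2.0 corresponds to a
# perplexity of exp(2.0) ~ 7.39, i.e. the model is on average about as uncertain as
# choosing uniformly between ~7 tokens.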
for log in log_history:
if 'loss' in log.keys():
# Deal with training loss.
loss_history['train_loss'].append(log['loss'])
perplexity_history['train_perplexity'].append(math.exp(log['loss']))
loss_history['train_epochs'].append(log["epoch"])
loss_history['train_steps'].append(log["step"])
elif 'eval_loss' in log.keys():
# Deal with eval loss.
loss_history['eval_loss'].append(log['eval_loss'])
perplexity_history['eval_perplexity'].append(math.exp(log['eval_loss']))
loss_history['eval_epochs'].append(log["epoch"])
loss_history['eval_steps'].append(log["step"])
# Plot Losses.
plt.figure()
plt.plot(loss_history['eval_epochs'], loss_history["eval_loss"],
label="eval loss")
plt.plot(loss_history['train_epochs'], loss_history["train_loss"],
label="train loss")
plt.xlabel("epoch", fontsize=14)
plt.ylabel("loss", fontsize=14)
plt.title(title, fontsize=16)
plt.grid(True)
plt.legend()
plt.show()
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--path_to_trainer_state_file",
default='./LanguageModelling/ger-patho-bert-w3/trainer_state.json')
args = parser.parse_args()
# example how to plot loss curve:
plot_loss_curve(args.path_to_trainer_state_file,
args.path_to_trainer_state_file.replace('/trainer_state.json',''))
if __name__ == '__main__':
main()
\ No newline at end of file
import TextClassification.classification_metrics as cls_metrics
import glob
import sys, os
from TextClassification.argsparse_classification_preamble import argsparse_preamble
import pickle
import database_preparation.utils_labeled_datasets as dt
sys.path.append(os.getcwd())
# script parameters:
metrics_folder = "cls_metrics/metrics_new"
def generate_save_conf_matrix(model_name, clustered_data,
test_set_index):
ob_dir = "./TextClassification/" + metrics_folder + "/" \
+ clustered_data + "_clustered_" + model_name + "_classified.pickle"
try:
with open(ob_dir, 'rb') as f:
metrics = pickle.load(f)
except FileNotFoundError:
return False
# plot confusion matrix
if "ger-patho-bert" in model_name:
titlename = "Patho-BERT"
elif "german" in model_name:
titlename = "German-BERT"
elif "SGD" in model_name:
titlename = "SGD-classifier"
elif "MLP" in model_name:
titlename = "MLP-classifier"
else:
titlename = model_name
unique_labels = dt.get_amount_unique_labels(clustered_data)
labels = [a for a in range(unique_labels)]
# labels=['class'+str(a) for a in range(unique_labels)]
# https://matplotlib.org/3.5.1/tutorials/colors/colormaps.html
metrics.plot_confusion_matrix(labels, prediction_set=test_set_index,
plot=False, save=True,
filename="confmatrx_"+clustered_data + "_clustered_" + model_name + "_classified",
title= titlename, normalized=True, annot = False, colormap='gist_heat')
return True
def print_f1_per_clusters(model_name, clustered_data):
# print f1-scores for each class, merged over all test runs
ob_dir = "./TextClassification/" + metrics_folder + "/" \
+ clustered_data + "_clustered_" + model_name + "_classified.pickle"
try:
with open(ob_dir, 'rb') as f:
metrics = pickle.load(f)
except FileNotFoundError:
return False
print("================ model: " + model_name + " | cluster-set: " + clustered_data + " ================")
df = metrics.classes_scores(-1)
print(df.to_latex().replace('{}', 'cluster'))
return True
def main():
args = argsparse_preamble()
model_names = ['SGDClassifier', 'MLPClassifier', 'ger-patho-bert-2', 'bert-base-german-cased',
'CNN', 'RNN', 'LogisticRegression', 'MultinomialNB']
# print f1 scores for each classification model:
for model in model_names:
print_f1_per_clusters(model, args.clustered_data)
# print classification overview tables for each clustering method:
print()
file_list = glob.glob("./TextClassification/" + metrics_folder + "/" + '/*.json')
for file in file_list:
cls_metrics.print_results_as_latextable(file, True)
print()
# generate all confusion matrices for each classification model:
for model in model_names:
try:
generate_save_conf_matrix(model, args.clustered_data, -1)
except Exception:
print(f"can't generate confusion matrix for {model}")
if __name__ == '__main__':
main()
\ No newline at end of file
import argparse
import sys, os
def argsparse_preamble():
sys.path.append(os.getcwd())
parser = argparse.ArgumentParser()
parser.add_argument("--find_k_value", action='store_true')
parser.add_argument("--k_value", type=int, default=10)
parser.add_argument("--show_figures", action='store_true')
parser.add_argument("--model2use", default="German_BERT")
parser.add_argument('--do_embedding', action='store_true')
parser.add_argument("--path2corpus", default='database/bow_prepro_diag.pkl')
parser.add_argument("--df_cases_file", default='database/df_cases.pkl')
args = parser.parse_args()
return args
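# Hedged CLI example (the script name is a placeholder; the flags match the parser above):
#   python TextClustering/<some_clustering_script>.py --k_value 10 \
#       --path2corpus database/bow_prepro_diag.pkl --df_cases_file database/df_cases.pkl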
#%%
from __future__ import unicode_literals, print_function, division
import csv
import numpy as np
class GSDPMM:
def __init__(self, K, alpha, beta, iterNum, dataset):
self.K=K
self.alpha=alpha
self.beta=beta
self.iterNum=iterNum
self.dataset=dataset
self.docu_set=docu_set(self.dataset)
self.docu_num=self.docu_set.docu_num
self.V=self.docu_set.V
self.alpha0=K*self.alpha
self.beta0=self.V*beta
self.m_z=np.zeros(K,dtype=int)
self.n_z=np.zeros(K,dtype=int)
self.n_zv=np.zeros([K,self.V],dtype=int)
self.z_c=np.zeros(self.docu_num,dtype=int)
self.num_list=self.docu_set.num_list
self.wordid_array=self.docu_set.wordid_array
self.wordfreq_array=self.docu_set.wordfreq_array
self.largedouble=1e100
self.smalldouble=1e-100
def initialize(self):
for d in range(self.docu_num):
self.z_c[d]=int(np.floor(self.K*np.random.uniform()))
cluster=self.z_c[d]
self.m_z[cluster]=self.m_z[cluster]+1
for w in range(len(self.num_list[d])):
self.n_zv[cluster][self.num_list[d][w]]=self.n_zv[cluster][self.num_list[d][w]]+1
self.n_z[cluster]=self.n_z[cluster]+1
def gibbs_sampling(self):
for i in range(self.iterNum):
for d in range(self.docu_num):
cluster=self.z_c[d]
self.m_z[cluster]=self.m_z[cluster]-1
for w in range(len(self.num_list[d])):
self.n_zv[cluster][self.num_list[d][w]]=self.n_zv[cluster][self.num_list[d][w]]-1
self.n_z[cluster]=self.n_z[cluster]-1
cluster=self.sample_cluster(d)
self.z_c[d]=cluster
self.m_z[cluster]=self.m_z[cluster]+1
for w in range(len(self.num_list[d])):
self.n_zv[cluster][self.num_list[d][w]]=self.n_zv[cluster][self.num_list[d][w]]+1
self.n_z[cluster]=self.n_z[cluster]+1
print(f'iteration {i}/{self.iterNum}')
def sample_cluster(self, d):
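# Samples a cluster for document d. The code below computes the GSDPMM conditional:
#   p(z_d = k) is proportional to (m_z[k] + alpha) / (docu_num + alpha0)
#       * prod over words w of prod_{j=0..freq(w)-1} (n_zv[k][w] + beta + j) / (n_z[k] + beta0 + i)
# where m_z[k] = documents in cluster k, n_z[k] = word tokens in cluster k,
# n_zv[k][w] = count of word w in cluster k, and i runs over the tokens of document d.
# The overflow_count bookkeeping rescales the running product to avoid floating-point underflow.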
prob=np.zeros(self.K)
overflow_count=np.zeros(self.K)
for k in range(self.K):
prob[k]=(self.m_z[k]+self.alpha)/(self.docu_num+self.alpha0)
value2=1.0
i=0
for w in range(len(self.wordid_array[d])):
wordNo=self.wordid_array[d][w]
wordfreq=self.wordfreq_array[d][w]
for j in range(wordfreq):
value2=value2*(self.n_zv[k][wordNo]+self.beta+j)/(self.n_z[k]+self.beta0+i)
i=i+1
if value2<self.smalldouble:
overflow_count[k]=overflow_count[k]-1
value2=value2*self.largedouble
prob[k]=prob[k]*value2
self.recompute_prob(prob, overflow_count, self.K)
for k in range(1,self.K):
prob[k]=prob[k-1]+prob[k]
sample=np.random.uniform()*prob[self.K-1]
kchoosed=0
for kchoosed in range(self.K):
if sample<prob[kchoosed]:
break
return kchoosed
def recompute_prob(self, prob, overflow_count, K):
max_common=-1e20
for k in range(K):
if overflow_count[k]>max_common and prob[k]>0:
max_common=overflow_count[k]
for k in range(K):
if prob[k]>0:
prob[k]=prob[k]*pow(self.largedouble,overflow_count[k]-max_common)
class docu_set:
def __init__(self, dataset):
self.docu_num=0
self.docs=[]
self.result=self.read_data(dataset)
self.lines=self.result[0]
self.wordtoId={}
self.wordfreq={}
self.V=len(self.wordtoId)
self.num_list, self.wordid_array, self.wordfreq_array=self.convert_to_numlist()
def read_data(self,filename):
data=[]
target=[]
with open(filename,'r') as csvfile:
line_reader=csv.reader(csvfile)
for line in line_reader:
data.append(line[2])
#target.append(line[3])
self.docu_num=len(data)
print(len(data))
return [data,target]
def convert_to_numlist(self):
n_lines=len(self.lines)
num_list=[[] for i in range(n_lines)]
wordid_array=[[] for i in range(n_lines)]
wordfreq_array=[[] for i in range(n_lines)]
for i in range(n_lines):
this_line=self.lines[i]
split_line=this_line.split()
for j in range(len(split_line)):
if split_line[j] in self.wordtoId:
self.wordfreq[self.wordtoId[split_line[j]]]=self.wordfreq[self.wordtoId[split_line[j]]]+1
Id=self.wordtoId.get(split_line[j])
if Id in wordid_array[i]:
wordfreq_array[i][wordid_array[i].index(Id)]+=1
else:
wordid_array[i].append(Id)
wordfreq_array[i].append(1)
else:
self.wordtoId[split_line[j]]=self.V
self.V=self.V+1
self.wordfreq[self.wordtoId[split_line[j]]]=1
Id=self.wordtoId.get(split_line[j])
if Id in wordid_array[i]:
wordfreq_array[i][wordid_array[i].index(Id)]+=1
else:
wordid_array[i].append(Id)
wordfreq_array[i].append(1)
num_list[i].append(self.wordtoId[split_line[j]])
return num_list, wordid_array, wordfreq_array
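# --- Hedged usage sketch (illustration only; the file name and toy texts are made up) ---
# read_data() expects the document text in the third CSV column (line[2]), i.e. the
# layout (index, 'text', document) that the GSDPMM clustering script writes to its temp CSV.
def _gsdpmm_toy_example(tmp_csv='gsdpmm_toy_example.csv'):
    docs = ["niere biopsie glomerulus", "glomerulus sklerose niere",
            "tubulus atrophie interstitium", "interstitium fibrose tubulus"]
    with open(tmp_csv, 'w', newline='') as f:
        writer = csv.writer(f)
        for i, doc in enumerate(docs):
            writer.writerow([i, 'text', doc])  # column 2 holds the document text
    model = GSDPMM(K=2, alpha=0.3, beta=0.02, iterNum=3, dataset=tmp_csv)
    model.initialize()
    model.gibbs_sampling()
    return model.z_c  # one cluster id per document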
#%% argsparse section
import sys, os
sys.path.append(os.getcwd())
from TextClustering.argsparse_clustering_preamble import argsparse_preamble
args = argsparse_preamble()
from database_preparation.utils_labeled_datasets import is_text_lst_tokenized
#%% import section
import pickle
from TextClustering.basedOn_BOW.GSDPMM import *
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import umap
from tqdm import tqdm
from TextClustering.utils_metrics import ClusterMetrics
from database_preparation.preprocess import print_meta_data
from TextClassification.classification_for_cluster_evaluation import cross_validate_label_corpus_with_simple_SVM
#%% load the data
with open(args.path2corpus, 'rb') as f:
diag_lst = pickle.load(f)
print_meta_data(args.path2corpus)
#%% and save it for DPMM
text = ['text'] * len(diag_lst)
if is_text_lst_tokenized(args.path2corpus):
text_tupls = list(zip(text, [' '.join(tokenized_text) for tokenized_text in diag_lst]))
else:
text_tupls = list(zip(text, [text for text in diag_lst]))
df = pd.DataFrame(text_tupls)
df.to_csv('TextClustering/basedOn_BOW/temp.csv', header=None)
def identity(word):
return word
def create_vectorizer(data):
vec = TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)
return vec.fit_transform(data)
text_features = create_vectorizer(diag_lst)
#%% find the best hyperparameter
if args.find_k_value:
# %% set the parameter
args.alpha = 0.3
args.beta = 0.02
args.iterNum = 5
args.dataset = 'TextClustering/basedOn_BOW/temp.csv'
k_list = np.arange(3, 23, 1)  # candidate numbers of clusters K (the search is over K, not beta)
s_score, n_cluster, svm_scores = [], [], []
n_steps = []
for i in tqdm(k_list):
#%% initialize it
gsdmm = GSDPMM(i,
args.alpha, args.beta,
args.iterNum,
args.dataset)
gsdmm.initialize()
# %% actually do it
gsdmm.gibbs_sampling()
#%% evalute the model
evaluation = ClusterMetrics(text_features, gsdmm.z_c)
s_score.append(evaluation.s_score)
svm_scores.append(
cross_validate_label_corpus_with_simple_SVM(gsdmm.z_c, args.path2corpus + '.pkl',
False))
n_cluster.append(len(np.unique(gsdmm.z_c)))
n_steps.append(i)
#%% plot it
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ax3 = ax1.twinx()
ax1.plot(n_steps, s_score, 'bx-')
ax2.plot(n_steps, n_cluster, 'rx-')
ax3.plot(n_steps, svm_scores, 'gx-')
ax1.set_xlabel('Number of clusters K')
ax1.yaxis.label.set_color('blue')
ax1.set_ylabel('s-score')
ax2.yaxis.label.set_color('red')
ax2.set_ylabel('Number of clusters')
ax3.yaxis.label.set_color('green')
ax3.set_ylabel('svm accuracy')
plt.title('Elbow-method-like plot')
plt.savefig("TextClustering/plots/elbow_method/GSDPMM_elbow_plot.png", dpi=300)
plt.show()
sys.exit()
#%% set the parameter
args.alpha = 0.3
args.beta = 0.02
args.iterNum = 5
args.dataset = 'TextClustering/basedOn_BOW/temp.csv'
gsdmm=GSDPMM(args.k_value,
args.alpha, args.beta,
args.iterNum,
args.dataset)
gsdmm.initialize()
#%% actually do it
gsdmm.gibbs_sampling()
#%% retrieve the results
A=gsdmm.z_c
num_list=gsdmm.num_list
m_z=gsdmm.m_z
n_z=gsdmm.n_z
n_zv=gsdmm.n_zv
docu_num=gsdmm.docu_num
predictedCluster=gsdmm.z_c
wordid_array=gsdmm.wordid_array
wordfreq_array=gsdmm.wordfreq_array
#%% save UMAP data points:
umap_text_features2D = umap.UMAP(n_neighbors=15,
n_components=2,
min_dist=0.0, metric='cosine').fit_transform(text_features)
# save umaped vectors and labels:
df = pd.read_pickle(args.df_cases_file)
df['umapX_GSDPMM'] = umap_text_features2D[:, 0]
df['umapY_GSDPMM'] = umap_text_features2D[:, 1]
df['label_GSDPMM'] = predictedCluster
df.to_pickle(args.df_cases_file)
#%% evalute the model
from TextClustering.utils_metrics import ClusterMetrics
evaluation = ClusterMetrics(text_features, predictedCluster,
file_name= "TextClustering/cluster_metrics/GSDPMM_metrics.pkl")
evaluation.write_to_file()
#%% argsparse section
import sys, os
sys.path.append(os.getcwd())
from TextClustering.argsparse_clustering_preamble import argsparse_preamble
args = argsparse_preamble()
from database_preparation.utils_labeled_datasets import is_text_lst_tokenized, is_text_lst_tfidf_vectorized
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import pandas as pd
import umap
import hdbscan
import numpy as np
from nltk import RegexpTokenizer
from TextClustering.utils_metrics import ClusterMetrics
from database_preparation.preprocess import print_meta_data
from TextClassification.classification_for_cluster_evaluation import cross_validate_label_corpus_with_simple_SVM
tokenizer = RegexpTokenizer(r'\w+')
#%% load the data
with open(args.path2corpus, 'rb') as f:
diag_lst = pickle.load(f)
print_meta_data(args.path2corpus)
def identity(word):
return word
text_is_vectorized = is_text_lst_tfidf_vectorized(args.path2corpus)
if not (is_text_lst_tokenized(args.path2corpus) or text_is_vectorized):
print("Error: " + args.path2corpus + ' has to be a list of tokenized texts or tfidf-vectorized texts!')
exit(1)
def create_vectorizer(data):
vec = TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)
vec = vec.fit_transform(data)
return vec
if text_is_vectorized:
text_features = diag_lst
else:
text_features = create_vectorizer(diag_lst)
#%% perform umap for dimension-reduction (for cluster-detection)
umap_text_features = umap.UMAP(n_neighbors=15,
n_components=5,
metric='cosine').fit_transform(text_features)
# and perform umap-dimension-reduction for visualization
umap_text_features2D = umap.UMAP(n_neighbors=15,
n_components=2,
min_dist=0.0, metric='cosine').fit_transform(text_features)
if args.find_k_value:
# %% perform hdbscan for cluster-detection with different minimum cluster sizes to find a good solution ... by eye
list_cluster_size = [int(k) for k in np.arange(3, 23, 1)]
s_score, n_cluster, svm_scores = [], [], []
for i_cluster_size in list_cluster_size:
cluster = hdbscan.HDBSCAN(min_cluster_size=i_cluster_size,
metric='euclidean',
cluster_selection_method='eom').fit(umap_text_features)
result = pd.DataFrame(umap_text_features2D, columns=['x', 'y'])
result['labels'] = cluster.labels_.tolist() # cluster.labels_
print(np.unique(result.labels))
#%% Visualize clusters
outliers = result.loc[result.labels == -1, :]
clustered = result.loc[result.labels != -1, :]
clustered['labels'] = [str(i) for i in clustered['labels']]
evaluation = ClusterMetrics(umap_text_features, cluster.labels_.tolist())
s_score.append(evaluation.s_score)
svm_scores.append(
cross_validate_label_corpus_with_simple_SVM(cluster.labels_.tolist(), args.path2corpus + '.pkl',
False))
n_cluster.append(len(np.unique(cluster.labels_.tolist())))
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ax3 = ax1.twinx()
ax1.plot(list_cluster_size, s_score, 'bx-')
ax2.plot(list_cluster_size, n_cluster, 'rx-')
ax3.plot(list_cluster_size, svm_scores, 'gx-')
ax1.set_xlabel('Minimal cluster size')
ax1.yaxis.label.set_color('blue')
ax1.set_ylabel('s-score')
ax2.yaxis.label.set_color('red')
ax2.set_ylabel('Number of clusters')
ax3.yaxis.label.set_color('green')
ax3.set_ylabel('svm accuracy')
plt.title('Elbow-method-like plot')
plt.savefig("TextClustering/plots/elbow_method/HDBSCAN_elbow_plot.png", dpi=300)
plt.show()
exit()
#%% perform hdbscan with best cluster size
cluster = hdbscan.HDBSCAN(min_cluster_size=args.k_value,
metric='euclidean',
cluster_selection_method='eom').fit(umap_text_features)
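# Note: HDBSCAN labels points it cannot assign to any cluster as -1 (noise);
# these are treated as outliers and excluded from the cluster metrics below.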
result = pd.DataFrame(umap_text_features2D, columns=['x', 'y'])
result['labels'] = cluster.labels_.tolist() # cluster.labels_
clusters = np.int8([str(i) for i in result['labels']])
outliers = result.loc[result.labels == -1, :]
clusters_no_outliers = result.loc[result.labels != -1, :]
unique_clusters = np.unique(result.labels)
print(f"\nfound {len(unique_clusters[unique_clusters>-1])} clusters.\n")
# save umaped vectors:
df = pd.read_pickle(args.df_cases_file)
df['umapX_HDBSCAN'] = umap_text_features2D[:, 0]
df['umapY_HDBSCAN'] = umap_text_features2D[:, 1]
df['label_HDBSCAN'] = clusters
df.to_pickle(args.df_cases_file)
#%% and evaluate the results with several metrics (not needing ground truth)
evaluation = ClusterMetrics(umap_text_features[result.labels >= 0,], clusters_no_outliers.labels.tolist(),
file_name= "TextClustering/cluster_metrics/HDBSCAN_metrics.pkl")
evaluation.write_to_file()
#%% argsparse section
import sys, os
sys.path.append(os.getcwd())
from TextClustering.argsparse_clustering_preamble import argsparse_preamble
args = argsparse_preamble()
from database_preparation.utils_labeled_datasets import is_text_lst_tokenized
if not is_text_lst_tokenized(args.path2corpus):
print("Error: "+args.path2corpus + '.pkl is not tokenized! '
'Please pass texts list where each text is tokenized (a list of words).')
exit(1)
#%% import section
import pickle
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import CoherenceModel
from tqdm import tqdm
from database_preparation.preprocess import print_meta_data
from TextClassification.classification_for_cluster_evaluation import cross_validate_label_corpus_with_simple_SVM
#%% load the diag and main_diag list
with open(args.path2corpus, 'rb') as f:
diag_lst = pickle.load(f)
print_meta_data(args.path2corpus)
#%% prepare the data for LDA model training
# Create the dictionary, which is a mapping of word IDs to words.
words = corpora.Dictionary(diag_lst)
# Turns each document into a bag of words.
corpus = [words.doc2bow(doc) for doc in diag_lst]  # bag-of-words corpus (not yet a trained model)
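# Illustrative (hedged) example: for a toy tokenized document like ['niere', 'niere', 'biopsie'],
# words.doc2bow(...) returns (word-id, count) pairs such as [(id_niere, 2), (id_biopsie, 1)];
# this sparse bag-of-words representation is what the LDA model is trained on.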
#%% train LDA-model with different number of clusters
if args.find_k_value:
limit=21; start=5; step=1
coherence_values = []
model_list, n_cluster, svm_scores = [], [], []
for num_topics in tqdm(range(start, limit, step)):
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
id2word=words,
num_topics=num_topics,
random_state=5,
update_every=1,
passes=10,
alpha='auto',
per_word_topics=True)
coherencemodel = CoherenceModel(model=lda_model, texts=diag_lst, dictionary=words,
coherence='c_v', processes= 1)
coherence_values.append(coherencemodel.get_coherence())
topic_weights = []
for i, row_list in enumerate(lda_model[corpus]):
topic_weights.append([w for i, w in row_list[0]])
predictedCluster = np.argmax(pd.DataFrame(topic_weights).fillna(0).values, axis=1)
svm_scores.append(
cross_validate_label_corpus_with_simple_SVM(predictedCluster, args.path2corpus,
False))
#n_cluster.append(len(lda_model.print_topics(num_words=3)))
n_cluster.append(len(np.unique(np.asarray(predictedCluster))))
print("coherence: " + str(coherencemodel.get_coherence()))
#%% visualize the results
x = range(start, limit, step)
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ax3 = ax1.twinx()
ax1.plot(x, coherence_values,'bx-')
ax2.plot(x, n_cluster, 'rx-')
ax3.plot(x, svm_scores, 'gx-')
ax1.set_xlabel('Number of topics')
ax1.yaxis.label.set_color('blue')
ax1.set_ylabel('Coherence score')
ax2.yaxis.label.set_color('red')
ax2.set_ylabel('Number of clusters')
ax3.yaxis.label.set_color('green')
ax3.set_ylabel('svm accuracy')
plt.title('Elbow-method-like plot')
plt.savefig("TextClustering/plots/elbow_method/LDA_elbow_plot.png", dpi=300)
plt.show()
exit()
#%% train LDA-model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
id2word=words,
num_topics=args.k_value,
random_state=5,
update_every=1,
passes=10,
alpha='auto',
per_word_topics=True)
#%% get topic weights / features
topic_weights = []
for i, row_list in enumerate(lda_model[corpus]):
topic_weights.append([w for i, w in row_list[0]])
# Array of topic weights
text_features = pd.DataFrame(topic_weights).fillna(0).values
#%% get prediction
predictedCluster= np.argmax(text_features, axis=1)
# and add it to the dataframe
df = pd.read_pickle(args.df_cases_file)
df['label_LDA'] = predictedCluster
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
reduced_features = pca.fit_transform(text_features)
df['pcaX_LDA'] = reduced_features[:, 0]
df['pcaY_LDA'] = reduced_features[:, 1]
#%% and with umap
import umap
umap_text_features2D = umap.UMAP(n_neighbors=15,
n_components=2,
min_dist=0.0, metric='cosine').fit_transform(text_features)
df['umapX_LDA'] = umap_text_features2D[:, 0]
df['umapY_LDA'] = umap_text_features2D[:, 1]
df.to_pickle(args.df_cases_file)
#%% evalute the model
from TextClustering.utils_metrics import ClusterMetrics
evaluation = ClusterMetrics(text_features, predictedCluster,
file_name= "TextClustering/cluster_metrics/LDA_metrics.pkl")
evaluation.write_to_file()
# %% import section
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import pandas as pd
from database_preparation.preprocess import print_meta_data
from TextClassification.classification_for_cluster_evaluation import cross_validate_label_corpus_with_simple_SVM
import umap
from database_preparation.utils_stringpreparation import get_most_frequent_words
import numpy as np
from database_preparation.utils_labeled_datasets import is_text_lst_tokenized, is_text_lst_tfidf_vectorized
from TextClustering.argsparse_clustering_preamble import argsparse_preamble
import os
args = argsparse_preamble()
plot_real_diagnosis = False
def identity(word):
return word
# %% load the data
with open(args.path2corpus, 'rb') as f:
diag_lst = pickle.load(f)
text_is_vectorized = is_text_lst_tfidf_vectorized(args.path2corpus)
if not (is_text_lst_tokenized(args.path2corpus) or text_is_vectorized):
print("Error: " + args.path2corpus + ' has to be a list of tokenized texts or tfidf-vectorized texts!')
exit(1)
print_meta_data(args.path2corpus)
def create_vectorizer(data):
vec = TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)
vec = vec.fit_transform(data)
return vec
if text_is_vectorized:
text_features = diag_lst
else:
text_features = create_vectorizer(diag_lst)
# %% perform elbow-method to find good cluster number
if args.find_k_value:
Sum_of_squared_distances, svm_values = [], []
K = range(2, 23, 1)
for k in K:
print("iteration #" + str(k))
km = KMeans(n_clusters=k, max_iter=200, n_init=10)
km = km.fit(text_features)
predictedCluster_text_features = km.predict(text_features)
Sum_of_squared_distances.append(km.inertia_)
svm_values.append(
cross_validate_label_corpus_with_simple_SVM(predictedCluster_text_features, args.path2corpus, False))
fig, ax1 = plt.subplots()
#ax2 = ax1.twinx()
ax3 = ax1.twinx()
ax1.plot(K, Sum_of_squared_distances, 'bx-')
#ax2.plot(K, svm_values, 'rx-')
ax3.plot(K, svm_values, 'gx-')
ax1.set_xlabel('K')
ax1.yaxis.label.set_color('blue')
ax1.set_ylabel('Sum_of_squared_distances')
#ax2.yaxis.label.set_color('red')
#ax2.set_ylabel('Number of clusters')
ax3.yaxis.label.set_color('green')
ax3.set_ylabel('svm accuracy')
plt.title('Elbow-method-like plot')
plt.savefig("TextClustering/plots/elbow_method/KMeans_elbow_plot.png", dpi=300)
plt.show()
exit()
km = KMeans(n_clusters=args.k_value, max_iter=200, n_init=10)
km = km.fit(text_features)
predictedCluster_text_features = km.predict(text_features)
umap_text_features2D = umap.UMAP(n_neighbors=15,
n_components=2,
min_dist=0.0, metric='cosine').fit_transform(text_features)
# save umaped vectors:
df = pd.read_pickle(args.df_cases_file)
df['umapX_KMeans'] = umap_text_features2D[:, 0]
df['umapY_KMeans'] = umap_text_features2D[:, 1]
df['label_KMeans'] = predictedCluster_text_features
df.to_pickle(args.df_cases_file)
clusters = km.labels_.tolist()
docs = {'text': diag_lst, 'cluster': clusters}
# %% generate topic words with GT:
if not text_is_vectorized:
frame = pd.DataFrame(docs, index=[clusters])
clusters = []
word_list_GT = []
n_words = 10
for cluster in range(0, args.k_value):
t_frame = frame[frame['cluster'] == cluster]
all_text = " ".join(t_frame['text'].astype(str))
top_words = get_most_frequent_words(all_text, n_words)
clusters.append(cluster)
word_list_GT.append(top_words)
for i in range(0, len(word_list_GT)):
t_token = np.array(word_list_GT[i])
if len(t_token) < n_words + 1:
t_token = np.append(t_token, np.repeat(np.nan, n_words - len(t_token)))
t_token = t_token.reshape((1, -1))
if i == 0:
token_list = t_token
else:
token_list = np.concatenate((token_list, t_token), axis=0)
pd.DataFrame(token_list).to_excel('TextClustering/tables/WordsPerCluster_kmeans.xlsx',
sheet_name="GT for kmeans")
# %% evalute the model by clustering metrics
from TextClustering.utils_metrics import ClusterMetrics
evaluation = ClusterMetrics(text_features, km.labels_.tolist(),
file_name="TextClustering/cluster_metrics/KMeans_metrics.pkl")
evaluation.write_to_file()
# %% argsparse preamble
import sys, os
sys.path.append(os.getcwd())
from TextClustering.argsparse_clustering_preamble import argsparse_preamble
args = argsparse_preamble()
from database_preparation.utils_labeled_datasets import is_text_lst_tokenized
if is_text_lst_tokenized(args.path2corpus):
print("Error: " + args.path2corpus + ' is tokenized! '
'Please pass texts list where each text is a single string!')
exit(1)
# %% prepare the background
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import umap
import hdbscan
from TextClustering.utils_metrics import ClusterMetrics
from sentence_transformers import SentenceTransformer
from database_preparation.preprocess import print_meta_data
embedding_backup_folder = "database/backup_files/"
if not os.path.isdir(embedding_backup_folder):
os.makedirs(embedding_backup_folder)
path_2_pathoBERT = "./LanguageModelling/ger-patho-bert-2"
with open(args.path2corpus, 'rb') as f:
diag_lst = pickle.load(f)
print_meta_data(args.path2corpus)
if args.do_embedding:
# %% load the model
if args.model2use == "German_BERT":
model = SentenceTransformer("Sahajtomar/German-semantic")
elif args.model2use == "Patho_BERT":
model = SentenceTransformer(path_2_pathoBERT)
# %% and apply the embedding-model to the text (only once, since very time-consuming)
if not 'embeddings' in locals():
embeddings = model.encode(diag_lst, show_progress_bar=True)
np.save(embedding_backup_folder + args.model2use + "_embeddingsBackup.npy", embeddings)
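# 'embeddings' is an (n_documents x embedding_dim) float array; it is cached to disk
# so that the expensive sentence-embedding step only has to run once per model.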
# %% load it (if not there)
if not 'embeddings' in locals():
embeddings = np.load(embedding_backup_folder + args.model2use + "_embeddingsBackup.npy")
# %% perform umap
umap_embeddings = umap.UMAP(n_neighbors=15,
n_components=5,
metric='cosine').fit_transform(embeddings)
# and perform umap-dimension-reduction for visualization
umap_text_features2D = umap.UMAP(n_neighbors=15,
n_components=2,
min_dist=0.0, metric='cosine').fit_transform(embeddings)
if 'umap_embeddings' in locals():
np.save(embedding_backup_folder + args.model2use + "_umap_embeddingsBackup.npy", umap_embeddings)
# %% perform repetitive clustering to find the best min_cluster_size
if not 'umap_embeddings' in locals():
umap_embeddings = np.load(embedding_backup_folder + args.model2use + "_umap_embeddingsBackup.npy")
if args.find_k_value:
cluster_size = range(5, 40, 2)
s_score, n_cluster = [], []
for i_cluster_size in cluster_size:
cluster = hdbscan.HDBSCAN(min_cluster_size=i_cluster_size,
metric='euclidean',
cluster_selection_method='eom').fit(umap_embeddings)
# and evaluate the results with several metrics (not needing ground truth)
from TextClustering.utils_metrics import ClusterMetrics
evaluation = ClusterMetrics(umap_embeddings, cluster.labels_.tolist())
s_score.append(evaluation.s_score)
n_cluster.append(len(np.unique(cluster.labels_.tolist())))
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ax1.plot(cluster_size, s_score, 'bx-')
ax2.plot(cluster_size, n_cluster, 'rx-')
ax1.set_xlabel('Minimal cluster size')
ax1.yaxis.label.set_color('blue')
ax1.set_ylabel('Silhouette Coefficient')
ax2.yaxis.label.set_color('red')
ax2.set_ylabel('Number of clusters')
plt.title('Elbow-method-like plot')
plt.show()
sys.exit()
# %% perform hdbscan-clustering
if not 'umap_embeddings' in locals():
umap_embeddings = np.load(embedding_backup_folder + args.model2use + "_umap_embeddingsBackup.npy")
cluster = hdbscan.HDBSCAN(min_cluster_size=args.k_value,
metric='euclidean',
cluster_selection_method='eom').fit(umap_embeddings)
# and print the results
result = pd.DataFrame(umap_text_features2D, columns=['x', 'y'])
result['labels'] = cluster.labels_.tolist() # cluster.labels_
print("cluster indices: " + str(np.unique(result.labels)))
outliers = result.loc[result.labels == -1, :]
clustered = result.loc[result.labels != -1, :]
print(str(len(outliers.x)) + " outliers")
# save umaped vectors:
df = pd.read_pickle(args.df_cases_file)
df['umapX_' + args.model2use] = result.x
df['umapY_' + args.model2use] = result.y
# %% update df_cases
df['label_' + args.model2use] = result.labels
df.to_pickle(args.df_cases_file)
# %% and evaluate the results with several metrics (not needing ground truth)
evaluation = ClusterMetrics(umap_embeddings[result.labels >= 0,], clustered.labels.tolist(),
file_name="TextClustering/cluster_metrics/" + args.model2use + "_metrics.pkl")
evaluation.write_to_file()
# Author: Dimo Angelov
#
# License: BSD 3 clause
import logging
import numpy as np
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import strip_tags
import umap
import hdbscan
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from joblib import dump, load
from sklearn.cluster import dbscan
import tempfile
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
from scipy.special import softmax
try:
import hnswlib
_HAVE_HNSWLIB = True
except ImportError:
_HAVE_HNSWLIB = False
try:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
_HAVE_TENSORFLOW = True
except ImportError:
_HAVE_TENSORFLOW = False
try:
from sentence_transformers import SentenceTransformer
_HAVE_TORCH = True
except ImportError:
_HAVE_TORCH = False
logger = logging.getLogger('top2vec')
logger.setLevel(logging.WARNING)
sh = logging.StreamHandler()
sh.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
logger.addHandler(sh)
def default_tokenizer(doc):
"""Tokenize documents for training and remove too long/short words"""
return simple_preprocess(strip_tags(doc), deacc=True)
class Top2Vec:
"""
Top2Vec
Creates jointly embedded topic, document and word vectors.
Parameters
----------
embedding_model: string
This will determine which model is used to generate the document and
word embeddings. The valid string options are:
* doc2vec
* universal-sentence-encoder
* universal-sentence-encoder-multilingual
* distiluse-base-multilingual-cased
For large data sets and data sets with very unique vocabulary doc2vec
could produce better results. This will train a doc2vec model from
scratch. This method is language agnostic. However multiple languages
will not be aligned.
Using the universal sentence encoder options will be much faster since
those are pre-trained and efficient models. The universal sentence
encoder options are suggested for smaller data sets. They are also
good options for large data sets that are in English or in languages
covered by the multilingual model. It is also suggested for data sets
that are multilingual.
For more information on universal-sentence-encoder visit:
https://tfhub.dev/google/universal-sentence-encoder/4
For more information on universal-sentence-encoder-multilingual visit:
https://tfhub.dev/google/universal-sentence-encoder-multilingual/3
The distiluse-base-multilingual-cased pre-trained sentence transformer
is suggested for multilingual datasets and languages that are not
covered by the multilingual universal sentence encoder. The
transformer is significantly slower than the universal sentence
encoder options.
For more information on distiluse-base-multilingual-cased visit:
https://www.sbert.net/docs/pretrained_models.html
embedding_model_path: string (Optional)
Pre-trained embedding models will be downloaded automatically by
default. However they can also be uploaded from a file that is in the
location of embedding_model_path.
Warning: the model at embedding_model_path must match the
embedding_model parameter type.
documents: List of str
Input corpus, should be a list of strings.
min_count: int (Optional, default 50)
Ignores all words with total frequency lower than this. For smaller
corpora a smaller min_count will be necessary.
speed: string (Optional, default 'learn')
This parameter is only used when using doc2vec as embedding_model.
It will determine how fast the model takes to train. The
fast-learn option is the fastest and will generate the lowest quality
vectors. The learn option will learn better quality vectors but take
a longer time to train. The deep-learn option will learn the best
quality vectors but will take significant time to train. The valid
string speed options are:
* fast-learn
* learn
* deep-learn
use_corpus_file: bool (Optional, default False)
This parameter is only used when using doc2vec as embedding_model.
Setting use_corpus_file to True can sometimes provide speedup for
large datasets when multiple worker threads are available. Documents
are still passed to the model as a list of str, the model will create
a temporary corpus file for training.
document_ids: List of str, int (Optional)
A unique value per document that will be used for referring to
documents in search results. If ids are not given to the model, the
index of each document in the original corpus will become the id.
keep_documents: bool (Optional, default True)
If set to False documents will only be used for training and not saved
as part of the model. This will reduce model size. When using search
functions only document ids will be returned, not the actual
documents.
workers: int (Optional)
The amount of worker threads to be used in training the model. Larger
amount will lead to faster training.
tokenizer: callable (Optional, default None)
Override the default tokenization method. If None then
gensim.utils.simple_preprocess will be used.
use_embedding_model_tokenizer: bool (Optional, default False)
If using an embedding model other than doc2vec, use the model's
tokenizer for document embedding. If set to True the tokenizer, either
default or passed callable will be used to tokenize the text to
extract the vocabulary for word embedding.
umap_args: dict (Optional, default None)
Pass custom arguments to UMAP.
hdbscan_args: dict (Optional, default None)
Pass custom arguments to HDBSCAN.
verbose: bool (Optional, default True)
Whether to print status data during training.
"""
def __init__(self,
documents,
min_count=50,
embedding_model='doc2vec',
embedding_model_path=None,
speed='learn',
use_corpus_file=False,
document_ids=None,
keep_documents=True,
workers=None,
tokenizer=None,
use_embedding_model_tokenizer=False,
umap_args=None,
hdbscan_args=None,
verbose=True
):
if verbose:
logger.setLevel(logging.DEBUG)
self.verbose = True
else:
logger.setLevel(logging.WARNING)
self.verbose = False
if tokenizer is None:
tokenizer = default_tokenizer
# validate documents
if not (isinstance(documents, list) or isinstance(documents, np.ndarray)):
raise ValueError("Documents need to be a list of strings")
if not all((isinstance(doc, str) or isinstance(doc, np.str_)) for doc in documents):
raise ValueError("Documents need to be a list of strings")
if keep_documents:
self.documents = np.array(documents, dtype="object")
else:
self.documents = None
# validate document ids
if document_ids is not None:
if not (isinstance(document_ids, list) or isinstance(document_ids, np.ndarray)):
raise ValueError("Documents ids need to be a list of str or int")
if len(documents) != len(document_ids):
raise ValueError("Document ids need to match number of documents")
elif len(document_ids) != len(set(document_ids)):
raise ValueError("Document ids need to be unique")
if all((isinstance(doc_id, str) or isinstance(doc_id, np.str_)) for doc_id in document_ids):
self.doc_id_type = np.str_
elif all((isinstance(doc_id, int) or isinstance(doc_id, np.int_)) for doc_id in document_ids):
self.doc_id_type = np.int_
else:
raise ValueError("Document ids need to be str or int")
self.document_ids_provided = True
self.document_ids = np.array(document_ids)
self.doc_id2index = dict(zip(document_ids, list(range(0, len(document_ids)))))
else:
self.document_ids_provided = False
self.document_ids = np.array(range(0, len(documents)))
self.doc_id2index = dict(zip(self.document_ids, list(range(0, len(self.document_ids)))))
self.doc_id_type = np.int_
acceptable_embedding_models = ["universal-sentence-encoder-multilingual",
"universal-sentence-encoder",
"distiluse-base-multilingual-cased"]
self.embedding_model_path = embedding_model_path
if embedding_model == 'doc2vec':
# validate training inputs
if speed == "fast-learn":
hs = 0
negative = 5
epochs = 40
elif speed == "learn":
hs = 1
negative = 0
epochs = 40
elif speed == "deep-learn":
hs = 1
negative = 0
epochs = 400
elif speed == "test-learn":
hs = 0
negative = 5
epochs = 1
else:
raise ValueError("speed parameter needs to be one of: fast-learn, learn or deep-learn")
if workers is None:
pass
elif isinstance(workers, int):
pass
else:
raise ValueError("workers needs to be an int")
doc2vec_args = {"vector_size": 300,
"min_count": min_count,
"window": 15,
"sample": 1e-5,
"negative": negative,
"hs": hs,
"epochs": epochs,
"dm": 0,
"dbow_words": 1}
if workers is not None:
doc2vec_args["workers"] = workers
logger.info('Pre-processing documents for training')
if use_corpus_file:
processed = [' '.join(tokenizer(doc)) for doc in documents]
lines = "\n".join(processed)
temp = tempfile.NamedTemporaryFile(mode='w+t')
temp.write(lines)
doc2vec_args["corpus_file"] = temp.name
else:
train_corpus = [TaggedDocument(tokenizer(doc), [i]) for i, doc in enumerate(documents)]
doc2vec_args["documents"] = train_corpus
logger.info('Creating joint document/word embedding')
self.embedding_model = 'doc2vec'
self.model = Doc2Vec(**doc2vec_args)
if use_corpus_file:
temp.close()
elif embedding_model in acceptable_embedding_models:
self.embed = None
self.embedding_model = embedding_model
self._check_import_status()
logger.info('Pre-processing documents for training')
# preprocess documents
tokenized_corpus = [tokenizer(doc) for doc in documents]
def return_doc(doc):
return doc
# preprocess vocabulary
vectorizer = CountVectorizer(tokenizer=return_doc, preprocessor=return_doc)
doc_word_counts = vectorizer.fit_transform(tokenized_corpus)
words = vectorizer.get_feature_names()
word_counts = np.array(np.sum(doc_word_counts, axis=0).tolist()[0])
vocab_inds = np.where(word_counts > min_count)[0]
if len(vocab_inds) == 0:
raise ValueError(f"A min_count of {min_count} results in "
f"all words being ignored, choose a lower value.")
self.vocab = [words[ind] for ind in vocab_inds]
self._check_model_status()
logger.info('Creating joint document/word embedding')
# embed words
self.word_indexes = dict(zip(self.vocab, range(len(self.vocab))))
self.word_vectors = self._l2_normalize(np.array(self.embed(self.vocab)))
# embed documents
if use_embedding_model_tokenizer:
self.document_vectors = self._embed_documents(documents)
else:
train_corpus = [' '.join(tokens) for tokens in tokenized_corpus]
self.document_vectors = self._embed_documents(train_corpus)
else:
raise ValueError(f"{embedding_model} is an invalid embedding model.")
# create 5D embeddings of documents
logger.info('Creating lower dimension embedding of documents')
if umap_args is None:
umap_args = {'n_neighbors': 15,
'n_components': 5,
'metric': 'cosine'}
self.umap_model = umap.UMAP(**umap_args).fit(self._get_document_vectors(norm=False))
# find dense areas of document vectors
logger.info('Finding dense areas of documents')
if hdbscan_args is None:
hdbscan_args = {'min_cluster_size': 15,
'metric': 'euclidean',
'cluster_selection_method': 'eom'}
cluster = hdbscan.HDBSCAN(**hdbscan_args).fit(self.umap_model.embedding_)
# calculate 2D coordinates for plotting / inspection
self.umap_model_2D = umap.UMAP(n_neighbors=15,
n_components=2,
min_dist=0.0, metric='cosine').fit_transform(self._get_document_vectors(norm=False))
self.result = pd.DataFrame(self.umap_model_2D, columns=['x', 'y'])
self.result['labels'] = cluster.labels_.tolist()
self.outliers = self.result.loc[self.result.labels == -1, :]
self.clustered = self.result.loc[self.result.labels != -1, :]
# calculate topic vectors from dense areas of documents
logger.info('Finding topics')
# create topic vectors
self._create_topic_vectors(cluster.labels_)
# deduplicate topics
self._deduplicate_topics()
# find topic words and scores
self.topic_words, self.topic_word_scores = self._find_topic_words_and_scores(topic_vectors=self.topic_vectors)
# assign documents to topic
self.doc_top, self.doc_dist = self._calculate_documents_topic(self.topic_vectors,
self._get_document_vectors())
# calculate topic sizes
self.topic_sizes = self._calculate_topic_sizes(hierarchy=False)
# re-order topics
self._reorder_topics(hierarchy=False)
# initialize variables for hierarchical topic reduction
self.topic_vectors_reduced = None
self.doc_top_reduced = None
self.doc_dist_reduced = None
self.topic_sizes_reduced = None
self.topic_words_reduced = None
self.topic_word_scores_reduced = None
self.hierarchy = None
# initialize document indexing variables
self.document_index = None
self.serialized_document_index = None
self.documents_indexed = False
self.index_id2doc_id = None
self.doc_id2index_id = None
# initialize word indexing variables
self.word_index = None
self.serialized_word_index = None
self.words_indexed = False
def save(self, file):
"""
Saves the current model to the specified file.
Parameters
----------
file: str
File where model will be saved.
"""
document_index_temp = None
word_index_temp = None
# do not save sentence encoders and sentence transformers
if self.embedding_model != "doc2vec":
self.embed = None
# serialize document index so that it can be saved
if self.documents_indexed:
temp = tempfile.NamedTemporaryFile(mode='w+b')
self.document_index.save_index(temp.name)
self.serialized_document_index = temp.read()
temp.close()
document_index_temp = self.document_index
self.document_index = None
# serialize word index so that it can be saved
if self.words_indexed:
temp = tempfile.NamedTemporaryFile(mode='w+b')
self.word_index.save_index(temp.name)
self.serialized_word_index = temp.read()
temp.close()
word_index_temp = self.word_index
self.word_index = None
dump(self, file)
self.document_index = document_index_temp
self.word_index = word_index_temp
@classmethod
def load(cls, file):
"""
Load a pre-trained model from the specified file.
Parameters
----------
file: str
File where model will be loaded from.
"""
top2vec_model = load(file)
# load document index
if top2vec_model.documents_indexed:
if not _HAVE_HNSWLIB:
raise ImportError(f"Cannot load document index.\n\n"
"Try: pip install top2vec[indexing]\n\n"
"Alternatively try: pip install hnswlib")
temp = tempfile.NamedTemporaryFile(mode='w+b')
temp.write(top2vec_model.serialized_document_index)
if top2vec_model.embedding_model == 'doc2vec':
document_vectors = top2vec_model.model.dv.vectors
else:
document_vectors = top2vec_model.document_vectors
top2vec_model.document_index = hnswlib.Index(space='ip',
dim=document_vectors.shape[1])
top2vec_model.document_index.load_index(temp.name, max_elements=document_vectors.shape[0])
temp.close()
top2vec_model.serialized_document_index = None
# load word index
if top2vec_model.words_indexed:
if not _HAVE_HNSWLIB:
raise ImportError(f"Cannot load word index.\n\n"
"Try: pip install top2vec[indexing]\n\n"
"Alternatively try: pip install hnswlib")
temp = tempfile.NamedTemporaryFile(mode='w+b')
temp.write(top2vec_model.serialized_word_index)
if top2vec_model.embedding_model == 'doc2vec':
word_vectors = top2vec_model.model.wv.vectors
else:
word_vectors = top2vec_model.word_vectors
top2vec_model.word_index = hnswlib.Index(space='ip',
dim=word_vectors.shape[1])
top2vec_model.word_index.load_index(temp.name, max_elements=word_vectors.shape[0])
temp.close()
top2vec_model.serialized_word_index = None
return top2vec_model
@staticmethod
def _l2_normalize(vectors):
if vectors.ndim == 2:
return normalize(vectors)
else:
return normalize(vectors.reshape(1, -1))[0]
def _embed_documents(self, train_corpus):
self._check_import_status()
self._check_model_status()
# embed documents
batch_size = 500
document_vectors = []
current = 0
batches = int(len(train_corpus) / batch_size)
extra = len(train_corpus) % batch_size
for ind in range(0, batches):
document_vectors.append(self.embed(train_corpus[current:current + batch_size]))
current += batch_size
if extra > 0:
document_vectors.append(self.embed(train_corpus[current:current + extra]))
document_vectors = self._l2_normalize(np.array(np.vstack(document_vectors)))
return document_vectors
def _embed_query(self, query):
self._check_import_status()
self._check_model_status()
return self._l2_normalize(np.array(self.embed([query])[0]))
def _set_document_vectors(self, document_vectors):
if self.embedding_model == 'doc2vec':
self.model.dv.vectors = document_vectors
else:
self.document_vectors = document_vectors
def _get_document_vectors(self, norm=True):
if self.embedding_model == 'doc2vec':
if norm:
self.model.dv.init_sims()
return self.model.dv.get_normed_vectors()
else:
return self.model.dv.vectors
else:
return self.document_vectors
def _index2word(self, index):
if self.embedding_model == 'doc2vec':
return self.model.wv.index_to_key[index]
else:
return self.vocab[index]
def _get_word_vectors(self):
if self.embedding_model == 'doc2vec':
self.model.wv.init_sims()
return self.model.wv.get_normed_vectors()
else:
return self.word_vectors
def _create_topic_vectors(self, cluster_labels):
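# Each topic vector is the L2-normalized centroid of the document vectors in one dense
# cluster; the HDBSCAN noise label (-1) is dropped, so outlier documents do not form a topic.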
unique_labels = set(cluster_labels)
if -1 in unique_labels:
unique_labels.remove(-1)
self.topic_vectors = self._l2_normalize(
np.vstack([self._get_document_vectors(norm=False)[np.where(cluster_labels == label)[0]]
.mean(axis=0) for label in unique_labels]))
def _deduplicate_topics(self):
core_samples, labels = dbscan(X=self.topic_vectors,
eps=0.1,
min_samples=2,
metric="cosine")
duplicate_clusters = set(labels)
if len(duplicate_clusters) > 1 or -1 not in duplicate_clusters:
# unique topics
unique_topics = self.topic_vectors[np.where(labels == -1)[0]]
if -1 in duplicate_clusters:
duplicate_clusters.remove(-1)
# merge duplicate topics
for unique_label in duplicate_clusters:
unique_topics = np.vstack(
[unique_topics, self._l2_normalize(self.topic_vectors[np.where(labels == unique_label)[0]]
.mean(axis=0))])
self.topic_vectors = unique_topics
def _calculate_topic_sizes(self, hierarchy=False):
if hierarchy:
topic_sizes = pd.Series(self.doc_top_reduced).value_counts()
else:
topic_sizes = pd.Series(self.doc_top).value_counts()
return topic_sizes
def _reorder_topics(self, hierarchy=False):
if hierarchy:
self.topic_vectors_reduced = self.topic_vectors_reduced[self.topic_sizes_reduced.index]
self.topic_words_reduced = self.topic_words_reduced[self.topic_sizes_reduced.index]
self.topic_word_scores_reduced = self.topic_word_scores_reduced[self.topic_sizes_reduced.index]
old2new = dict(zip(self.topic_sizes_reduced.index, range(self.topic_sizes_reduced.index.shape[0])))
self.doc_top_reduced = np.array([old2new[i] for i in self.doc_top_reduced])
self.hierarchy = [self.hierarchy[i] for i in self.topic_sizes_reduced.index]
self.topic_sizes_reduced.reset_index(drop=True, inplace=True)
else:
self.topic_vectors = self.topic_vectors[self.topic_sizes.index]
self.topic_words = self.topic_words[self.topic_sizes.index]
self.topic_word_scores = self.topic_word_scores[self.topic_sizes.index]
old2new = dict(zip(self.topic_sizes.index, range(self.topic_sizes.index.shape[0])))
self.doc_top = np.array([old2new[i] for i in self.doc_top])
self.topic_sizes.reset_index(drop=True, inplace=True)
@staticmethod
def _calculate_documents_topic(topic_vectors, document_vectors, dist=True, num_topics=None):
batch_size = 10000
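# Each document is assigned to the topic with the largest inner product; because both vector
# sets are L2-normalized this equals cosine similarity. The similarity matrix is computed in
# batches of 10,000 documents to keep memory bounded.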
doc_top = []
if dist:
doc_dist = []
if document_vectors.shape[0] > batch_size:
current = 0
batches = int(document_vectors.shape[0] / batch_size)
extra = document_vectors.shape[0] % batch_size
for ind in range(0, batches):
res = np.inner(document_vectors[current:current + batch_size], topic_vectors)
if num_topics is None:
doc_top.extend(np.argmax(res, axis=1))
if dist:
doc_dist.extend(np.max(res, axis=1))
else:
doc_top.extend(np.flip(np.argsort(res), axis=1)[:, :num_topics])
if dist:
doc_dist.extend(np.flip(np.sort(res), axis=1)[:, :num_topics])
current += batch_size
if extra > 0:
res = np.inner(document_vectors[current:current + extra], topic_vectors)
if num_topics is None:
doc_top.extend(np.argmax(res, axis=1))
if dist:
doc_dist.extend(np.max(res, axis=1))
else:
doc_top.extend(np.flip(np.argsort(res), axis=1)[:, :num_topics])
if dist:
doc_dist.extend(np.flip(np.sort(res), axis=1)[:, :num_topics])
if dist:
doc_dist = np.array(doc_dist)
else:
res = np.inner(document_vectors, topic_vectors)
if num_topics is None:
doc_top = np.argmax(res, axis=1)
if dist:
doc_dist = np.max(res, axis=1)
else:
doc_top.extend(np.flip(np.argsort(res), axis=1)[:, :num_topics])
if dist:
doc_dist.extend(np.flip(np.sort(res), axis=1)[:, :num_topics])
if num_topics is not None:
doc_top = np.array(doc_top)
if dist:
doc_dist = np.array(doc_dist)
if dist:
return doc_top, doc_dist
else:
return doc_top
def _find_topic_words_and_scores(self, topic_vectors):
topic_words = []
topic_word_scores = []
res = np.inner(topic_vectors, self._get_word_vectors())
top_words = np.flip(np.argsort(res, axis=1), axis=1)
top_scores = np.flip(np.sort(res, axis=1), axis=1)
for words, scores in zip(top_words, top_scores):
topic_words.append([self._index2word(i) for i in words[0:50]])
topic_word_scores.append(scores[0:50])
topic_words = np.array(topic_words)
topic_word_scores = np.array(topic_word_scores)
return topic_words, topic_word_scores
def _assign_documents_to_topic(self, document_vectors, hierarchy=False):
if hierarchy:
doc_top_new, doc_dist_new = self._calculate_documents_topic(self.topic_vectors_reduced,
document_vectors,
dist=True)
self.doc_top_reduced = np.append(self.doc_top_reduced, doc_top_new)
self.doc_dist_reduced = np.append(self.doc_dist_reduced, doc_dist_new)
topic_sizes_new = pd.Series(doc_top_new).value_counts()
for top in topic_sizes_new.index.tolist():
self.topic_sizes_reduced[top] += topic_sizes_new[top]
self.topic_sizes_reduced.sort_values(ascending=False, inplace=True)
self._reorder_topics(hierarchy)
else:
doc_top_new, doc_dist_new = self._calculate_documents_topic(self.topic_vectors, document_vectors, dist=True)
self.doc_top = np.append(self.doc_top, doc_top_new)
self.doc_dist = np.append(self.doc_dist, doc_dist_new)
topic_sizes_new = pd.Series(doc_top_new).value_counts()
for top in topic_sizes_new.index.tolist():
self.topic_sizes[top] += topic_sizes_new[top]
self.topic_sizes.sort_values(ascending=False, inplace=True)
self._reorder_topics(hierarchy)
def _unassign_documents_from_topic(self, doc_indexes, hierarchy=False):
if hierarchy:
doc_top_remove = self.doc_top_reduced[doc_indexes]
self.doc_top_reduced = np.delete(self.doc_top_reduced, doc_indexes, 0)
self.doc_dist_reduced = np.delete(self.doc_dist_reduced, doc_indexes, 0)
topic_sizes_remove = pd.Series(doc_top_remove).value_counts()
for top in topic_sizes_remove.index.tolist():
self.topic_sizes_reduced[top] -= topic_sizes_remove[top]
self.topic_sizes_reduced.sort_values(ascending=False, inplace=True)
self._reorder_topics(hierarchy)
else:
doc_top_remove = self.doc_top[doc_indexes]
self.doc_top = np.delete(self.doc_top, doc_indexes, 0)
self.doc_dist = np.delete(self.doc_dist, doc_indexes, 0)
topic_sizes_remove = pd.Series(doc_top_remove).value_counts()
for top in topic_sizes_remove.index.tolist():
self.topic_sizes[top] -= topic_sizes_remove[top]
self.topic_sizes.sort_values(ascending=False, inplace=True)
self._reorder_topics(hierarchy)
def _get_document_ids(self, doc_index):
return self.document_ids[doc_index]
def _get_document_indexes(self, doc_ids):
if self.document_ids is None:
return doc_ids
else:
return [self.doc_id2index[doc_id] for doc_id in doc_ids]
def _words2word_vectors(self, keywords):
return self._get_word_vectors()[[self._word2index(word) for word in keywords]]
def _word2index(self, word):
if self.embedding_model == 'doc2vec':
return self.model.wv.vocab[word].index
else:
return self.word_indexes[word]
def _get_combined_vec(self, vecs, vecs_neg):
combined_vector = np.zeros(self._get_document_vectors().shape[1], dtype=np.float64)
for vec in vecs:
combined_vector += vec
for vec in vecs_neg:
combined_vector -= vec
combined_vector /= (len(vecs) + len(vecs_neg))
combined_vector = self._l2_normalize(combined_vector)
return combined_vector
@staticmethod
def _search_vectors_by_vector(vectors, vector, num_res):
ranks = np.inner(vectors, vector)
indexes = np.flip(np.argsort(ranks)[-num_res:])
scores = np.array([ranks[res] for res in indexes])
return indexes, scores
@staticmethod
def _check_hnswlib_status():
if not _HAVE_HNSWLIB:
raise ImportError(f"Indexing is not available.\n\n"
"Try: pip install top2vec[indexing]\n\n"
"Alternatively try: pip install hnswlib")
def _check_document_index_status(self):
if self.document_index is None:
raise ImportError("There is no document index.\n\n"
"Call index_document_vectors method before setting use_index=True.")
def _check_word_index_status(self):
if self.word_index is None:
raise ImportError("There is no word index.\n\n"
"Call index_word_vectors method before setting use_index=True.")
def _check_import_status(self):
if self.embedding_model != 'distiluse-base-multilingual-cased':
if not _HAVE_TENSORFLOW:
raise ImportError(f"{self.embedding_model} is not available.\n\n"
"Try: pip install top2vec[sentence_encoders]\n\n"
"Alternatively try: pip install tensorflow tensorflow_hub tensorflow_text")
else:
if not _HAVE_TORCH:
raise ImportError(f"{self.embedding_model} is not available.\n\n"
"Try: pip install top2vec[sentence_transformers]\n\n"
"Alternatively try: pip install torch sentence_transformers")
def _check_model_status(self):
if self.embed is None:
if self.verbose is False:
logger.setLevel(logging.DEBUG)
if self.embedding_model != "distiluse-base-multilingual-cased":
if self.embedding_model_path is None:
logger.info(f'Downloading {self.embedding_model} model')
if self.embedding_model == "universal-sentence-encoder-multilingual":
module = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
else:
module = "https://tfhub.dev/google/universal-sentence-encoder/4"
else:
logger.info(f'Loading {self.embedding_model} model at {self.embedding_model_path}')
module = self.embedding_model_path
self.embed = hub.load(module)
else:
if self.embedding_model_path is None:
logger.info(f'Downloading {self.embedding_model} model')
module = 'distiluse-base-multilingual-cased'
else:
logger.info(f'Loading {self.embedding_model} model at {self.embedding_model_path}')
module = self.embedding_model_path
model = SentenceTransformer(module)
self.embed = model.encode
if self.verbose is False:
logger.setLevel(logging.WARNING)
@staticmethod
def _less_than_zero(num, var_name):
if num < 0:
raise ValueError(f"{var_name} cannot be less than 0.")
def _validate_hierarchical_reduction(self):
if self.hierarchy is None:
raise ValueError("Hierarchical topic reduction has not been performed.")
def _validate_hierarchical_reduction_num_topics(self, num_topics):
current_num_topics = len(self.topic_vectors)
if num_topics >= current_num_topics:
raise ValueError(f"Number of topics must be less than {current_num_topics}.")
def _validate_num_docs(self, num_docs):
self._less_than_zero(num_docs, "num_docs")
document_count = len(self.doc_top)
if num_docs > document_count:
raise ValueError(f"num_docs cannot exceed the number of documents: {document_count}.")
def _validate_num_topics(self, num_topics, reduced):
self._less_than_zero(num_topics, "num_topics")
if reduced:
topic_count = len(self.topic_vectors_reduced)
if num_topics > topic_count:
raise ValueError(f"num_topics cannot exceed the number of reduced topics: {topic_count}.")
else:
topic_count = len(self.topic_vectors)
if num_topics > topic_count:
raise ValueError(f"num_topics cannot exceed the number of topics: {topic_count}.")
def _validate_topic_num(self, topic_num, reduced):
self._less_than_zero(topic_num, "topic_num")
if reduced:
topic_count = len(self.topic_vectors_reduced) - 1
if topic_num > topic_count:
raise ValueError(f"Invalid topic number: valid reduced topics numbers are 0 to {topic_count}.")
else:
topic_count = len(self.topic_vectors) - 1
if topic_num > topic_count:
raise ValueError(f"Invalid topic number: valid original topics numbers are 0 to {topic_count}.")
def _validate_topic_search(self, topic_num, num_docs, reduced):
self._less_than_zero(num_docs, "num_docs")
if reduced:
if num_docs > self.topic_sizes_reduced[topic_num]:
raise ValueError(f"Invalid number of documents: reduced topic {topic_num}"
f" only has {self.topic_sizes_reduced[topic_num]} documents.")
else:
if num_docs > self.topic_sizes[topic_num]:
raise ValueError(f"Invalid number of documents: original topic {topic_num}"
f" only has {self.topic_sizes[topic_num]} documents.")
def _validate_doc_ids(self, doc_ids, doc_ids_neg):
if not (isinstance(doc_ids, list) or isinstance(doc_ids, np.ndarray)):
raise ValueError("doc_ids must be a list of string or int.")
if not (isinstance(doc_ids_neg, list) or isinstance(doc_ids_neg, np.ndarray)):
raise ValueError("doc_ids_neg must be a list of string or int.")
if isinstance(doc_ids, np.ndarray):
doc_ids = list(doc_ids)
if isinstance(doc_ids_neg, np.ndarray):
doc_ids_neg = list(doc_ids_neg)
doc_ids_all = doc_ids + doc_ids_neg
if self.document_ids is not None:
for doc_id in doc_ids_all:
if doc_id not in self.doc_id2index:
raise ValueError(f"{doc_id} is not a valid document id.")
elif min(doc_ids) < 0:
raise ValueError(f"{min(doc_ids)} is not a valid document id.")
elif max(doc_ids) > len(self.doc_top) - 1:
raise ValueError(f"{max(doc_ids)} is not a valid document id.")
def _validate_keywords(self, keywords, keywords_neg):
if not (isinstance(keywords, list) or isinstance(keywords, np.ndarray)):
raise ValueError("keywords must be a list of strings.")
if not (isinstance(keywords_neg, list) or isinstance(keywords_neg, np.ndarray)):
raise ValueError("keywords_neg must be a list of strings.")
keywords_lower = [keyword.lower() for keyword in keywords]
keywords_neg_lower = [keyword.lower() for keyword in keywords_neg]
if self.embedding_model == 'doc2vec':
vocab = self.model.wv.vocab
else:
vocab = self.vocab
for word in keywords_lower + keywords_neg_lower:
if word not in vocab:
raise ValueError(f"'{word}' has not been learned by the model so it cannot be searched.")
return keywords_lower, keywords_neg_lower
def _validate_document_ids_add_doc(self, documents, document_ids):
if document_ids is None:
raise ValueError("Document ids need to be provided.")
if len(documents) != len(document_ids):
raise ValueError("Document ids need to match number of documents.")
if len(document_ids) != len(set(document_ids)):
raise ValueError("Document ids need to be unique.")
if len(set(document_ids).intersection(self.document_ids)) > 0:
raise ValueError("Some document ids already exist in model.")
if self.doc_id_type == np.str_:
if not all((isinstance(doc_id, str) or isinstance(doc_id, np.str_)) for doc_id in document_ids):
raise ValueError("Document ids need to be of type str.")
if self.doc_id_type == np.int_:
if not all((isinstance(doc_id, int) or isinstance(doc_id, np.int_)) for doc_id in document_ids):
raise ValueError("Document ids need to be of type int.")
@staticmethod
def _validate_documents(documents):
if not all((isinstance(doc, str) or isinstance(doc, np.str_)) for doc in documents):
raise ValueError("Documents need to be a list of strings.")
@staticmethod
def _validate_query(query):
if not isinstance(query, (str, np.str_)):
raise ValueError("Query needs to be a string.")
def _validate_vector(self, vector):
if not isinstance(vector, np.ndarray):
raise ValueError("Vector needs to be a numpy array.")
vec_size = self._get_document_vectors().shape[1]
if not vector.shape[0] == vec_size:
raise ValueError(f"Vector needs to be of {vec_size} dimensions.")
def index_document_vectors(self, ef_construction=200, M=64):
"""
Creates an index of the document vectors using hnswlib. This will
lead to faster search times for models with a large number of
documents.
For more information on hnswlib see: https://github.com/nmslib/hnswlib
Parameters
----------
ef_construction: int (Optional default 200)
This parameter controls the trade-off between index construction
time and index accuracy. Larger values will lead to greater
accuracy but will take longer to construct.
M: int (Optional default 64)
This parameter controls the trade-off between both index size as
well as construction time and accuracy. Larger values will lead to
greater accuracy but will result in a larger index as well as
longer construction time.
For more information on the parameters see:
https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
"""
self._check_hnswlib_status()
document_vectors = self._get_document_vectors()
vec_dim = document_vectors.shape[1]
num_vecs = document_vectors.shape[0]
index_ids = list(range(0, len(self.document_ids)))
self.index_id2doc_id = dict(zip(index_ids, self.document_ids))
self.doc_id2index_id = dict(zip(self.document_ids, index_ids))
self.document_index = hnswlib.Index(space='ip', dim=vec_dim)
self.document_index.init_index(max_elements=num_vecs, ef_construction=ef_construction, M=M)
self.document_index.add_items(document_vectors, index_ids)
self.documents_indexed = True
def index_word_vectors(self, ef_construction=200, M=64):
"""
Creates an index of the word vectors using hnswlib. This will
lead to faster search times for models with a large number of
words.
For more information on hnswlib see: https://github.com/nmslib/hnswlib
Parameters
----------
ef_construction: int (Optional default 200)
This parameter controls the trade-off between index construction
time and index accuracy. Larger values will lead to greater
accuracy but will take longer to construct.
M: int (Optional default 64)
This parameter controls the trade-off between both index size as
well as construction time and accuracy. Larger values will lead to
greater accuracy but will result in a larger index as well as
longer construction time.
For more information on the parameters see:
https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
"""
self._check_hnswlib_status()
word_vectors = self._get_word_vectors()
vec_dim = word_vectors.shape[1]
num_vecs = word_vectors.shape[0]
index_ids = list(range(0, num_vecs))
self.word_index = hnswlib.Index(space='ip', dim=vec_dim)
self.word_index.init_index(max_elements=num_vecs, ef_construction=ef_construction, M=M)
self.word_index.add_items(word_vectors, index_ids)
self.words_indexed = True
def update_embedding_model_path(self, embedding_model_path):
"""
Update the path of the embedding model to be loaded. The model will
no longer be downloaded but loaded from the path location.
Warning: the model at embedding_model_path must match the
embedding_model parameter type.
Parameters
----------
embedding_model_path: Str
Path to downloaded embedding model.
"""
self.embedding_model_path = embedding_model_path
def change_to_download_embedding_model(self):
"""
Use automatic download to load the embedding model used for training.
Top2Vec will no longer try to load the embedding model from a file
if an embedding_model path was previously added.
"""
self.embedding_model_path = None
def get_documents_topics(self, doc_ids, reduced=False, num_topics=1):
"""
Get document topics.
The topic of each document will be returned.
The corresponding original topics are returned unless reduced=True,
in which case the reduced topics will be returned.
Parameters
----------
doc_ids: List of str, int
A unique value per document that is used for referring to
documents in search results. If ids were not given to the model,
the index of each document in the model is the id.
reduced: bool (Optional, default False)
Original topics are returned by default. If True the
reduced topics will be returned.
num_topics: int (Optional, default 1)
The number of topics to return per document.
Returns
-------
topic_nums: array of int, shape(len(doc_ids), num_topics)
The topic number(s) of the document corresponding to each doc_id.
topic_score: array of float, shape(len(doc_ids), num_topics)
Semantic similarity of document to topic(s). The cosine similarity
of the document and topic vector.
topics_words: array of shape(len(doc_ids), num_topics, 50)
For each topic the top 50 words are returned, in order
of semantic similarity to topic.
Example:
[['data', 'deep', 'learning' ... 'artificial'], <Topic 4>
['environment', 'warming', 'climate' ... 'temperature'] <Topic 21>
...]
word_scores: array of shape(num_topics, 50)
For each topic the cosine similarity scores of the
top 50 words to the topic are returned.
Example:
[[0.7132, 0.6473, 0.5700 ... 0.3455], <Topic 4>
[0.7818, 0.7671, 0.7603 ... 0.6769] <Topic 21>
...]
"""
if reduced:
self._validate_hierarchical_reduction()
# make sure documents exist
self._validate_doc_ids(doc_ids, doc_ids_neg=[])
# get document indexes from ids
doc_indexes = self._get_document_indexes(doc_ids)
if num_topics == 1:
if reduced:
doc_topics = self.doc_top_reduced[doc_indexes]
doc_dist = self.doc_dist_reduced[doc_indexes]
topic_words = self.topic_words_reduced[doc_topics]
topic_word_scores = self.topic_word_scores_reduced[doc_topics]
else:
doc_topics = self.doc_top[doc_indexes]
doc_dist = self.doc_dist[doc_indexes]
topic_words = self.topic_words[doc_topics]
topic_word_scores = self.topic_word_scores[doc_topics]
else:
if reduced:
topic_vectors = self.topic_vectors_reduced
else:
topic_vectors = self.topic_vectors
doc_topics, doc_dist = self._calculate_documents_topic(topic_vectors,
self._get_document_vectors()[doc_indexes],
num_topics=num_topics)
topic_words = np.array([self.topic_words[topics] for topics in doc_topics])
topic_word_scores = np.array([self.topic_word_scores[topics] for topics in doc_topics])
return doc_topics, doc_dist, topic_words, topic_word_scores
def add_documents(self, documents, doc_ids=None, tokenizer=None, use_embedding_model_tokenizer=False):
"""
Update the model with new documents.
The documents will be added to the current model without changing
existing document, word and topic vectors. Topic sizes will be updated.
If adding a large quantity of documents relative to the current model
size, or documents containing a largely new vocabulary, a new model
should be trained for best results.
Parameters
----------
documents: List of str
doc_ids: List of str, int (Optional)
Only required when doc_ids were given to the original model.
A unique value per document that will be used for referring to
documents in search results.
tokenizer: callable (Optional, default None)
Override the default tokenization method. If None then
gensim.utils.simple_preprocess will be used.
use_embedding_model_tokenizer: bool (Optional, default False)
If using an embedding model other than doc2vec, use the model's
tokenizer for document embedding.
"""
# if tokenizer is not passed use default
if tokenizer is None:
tokenizer = default_tokenizer
# add documents
self._validate_documents(documents)
if self.documents is not None:
self.documents = np.append(self.documents, documents)
# add document ids
if self.document_ids_provided is True:
self._validate_document_ids_add_doc(documents, doc_ids)
doc_ids_len = len(self.document_ids)
self.document_ids = np.append(self.document_ids, doc_ids)
self.doc_id2index.update(dict(zip(doc_ids, list(range(doc_ids_len, doc_ids_len + len(doc_ids))))))
elif doc_ids is None:
num_docs = len(documents)
start_id = max(self.document_ids) + 1
doc_ids = list(range(start_id, start_id + num_docs))
doc_ids_len = len(self.document_ids)
self.document_ids = np.append(self.document_ids, doc_ids)
self.doc_id2index.update(dict(zip(doc_ids, list(range(doc_ids_len, doc_ids_len + len(doc_ids))))))
else:
raise ValueError("doc_ids cannot be used because they were not provided to model during training.")
if self.embedding_model == "doc2vec":
docs_processed = [tokenizer(doc) for doc in documents]
document_vectors = np.vstack([self.model.infer_vector(doc_words=doc,
alpha=0.025,
min_alpha=0.01,
epochs=100) for doc in docs_processed])
num_docs = len(documents)
self.model.dv.count += num_docs
self.model.dv.max_rawint += num_docs
self.model.dv.vectors_norm = None
self._set_document_vectors(np.vstack([self._get_document_vectors(norm=False), document_vectors]))
self.model.dv.init_sims()
document_vectors = self._l2_normalize(document_vectors)
else:
if use_embedding_model_tokenizer:
docs_training = documents
else:
docs_processed = [tokenizer(doc) for doc in documents]
docs_training = [' '.join(doc) for doc in docs_processed]
document_vectors = self._embed_documents(docs_training)
self._set_document_vectors(np.vstack([self._get_document_vectors(), document_vectors]))
# update index
if self.documents_indexed:
# update capacity of index
current_max = self.document_index.get_max_elements()
updated_max = current_max + len(documents)
self.document_index.resize_index(updated_max)
# update index_id and doc_ids
start_index_id = max(self.index_id2doc_id.keys()) + 1
new_index_ids = list(range(start_index_id, start_index_id + len(doc_ids)))
self.index_id2doc_id.update(dict(zip(new_index_ids, doc_ids)))
self.doc_id2index_id.update(dict(zip(doc_ids, new_index_ids)))
self.document_index.add_items(document_vectors, new_index_ids)
# update topics
self._assign_documents_to_topic(document_vectors, hierarchy=False)
if self.hierarchy is not None:
self._assign_documents_to_topic(document_vectors, hierarchy=True)
def delete_documents(self, doc_ids):
"""
Delete documents from current model.
Warning: If document ids were not used in original model, deleting
documents will change the indexes and therefore doc_ids.
The documents will be deleted from the current model without changing
existing document, word and topic vectors. Topic sizes will be updated.
If deleting a large quantity of documents relative to the current model
size a new model should be trained for best results.
Parameters
----------
doc_ids: List of str, int
A unique value per document that is used for referring to documents
in search results.
"""
# make sure documents exist
self._validate_doc_ids(doc_ids, doc_ids_neg=[])
# update index
if self.documents_indexed:
# delete doc_ids from index
index_ids = [self.doc_id2index_id[doc_id] for doc_id in doc_ids]
for index_id in index_ids:
self.document_index.mark_deleted(index_id)
# update index_id and doc_ids
for doc_id in doc_ids:
self.doc_id2index_id.pop(doc_id)
for index_id in index_ids:
self.index_id2doc_id.pop(index_id)
# get document indexes from ids
doc_indexes = self._get_document_indexes(doc_ids)
# delete documents
if self.documents is not None:
self.documents = np.delete(self.documents, doc_indexes, 0)
# delete document ids
if self.document_ids is not None:
for doc_id in doc_ids:
self.doc_id2index.pop(doc_id)
keys = list(self.doc_id2index.keys())
self.document_ids = np.array(keys)
values = list(range(0, len(self.doc_id2index.values())))
self.doc_id2index = dict(zip(keys, values))
# delete document vectors
self._set_document_vectors(np.delete(self._get_document_vectors(norm=False), doc_indexes, 0))
if self.embedding_model == 'doc2vec':
num_docs = len(doc_indexes)
self.model.dv.count -= num_docs
self.model.dv.max_rawint -= num_docs
self.model.dv.vectors_norm = None
self.model.dv.init_sims()
# update topics
self._unassign_documents_from_topic(doc_indexes, hierarchy=False)
if self.hierarchy is not None:
self._unassign_documents_from_topic(doc_indexes, hierarchy=True)
def get_num_topics(self, reduced=False):
"""
Get number of topics.
This is the number of topics Top2Vec has found in the data by default.
If reduced is True, the number of reduced topics is returned.
Parameters
----------
reduced: bool (Optional, default False)
The number of original topics will be returned by default. If True
will return the number of reduced topics, if hierarchical topic
reduction has been performed.
Returns
-------
num_topics: int
"""
if reduced:
self._validate_hierarchical_reduction()
return len(self.topic_vectors_reduced)
else:
return len(self.topic_vectors)
def get_topic_sizes(self, reduced=False):
"""
Get topic sizes.
The number of documents most similar to each topic. Topics are
in decreasing order of size.
The sizes of the original topics are returned unless reduced=True,
in which case the sizes of the reduced topics will be returned.
Parameters
----------
reduced: bool (Optional, default False)
Original topic sizes are returned by default. If True the
reduced topic sizes will be returned.
Returns
-------
topic_sizes: array of int, shape(num_topics)
The number of documents most similar to the topic.
topic_nums: array of int, shape(num_topics)
The unique number of every topic will be returned.
"""
if reduced:
self._validate_hierarchical_reduction()
return np.array(self.topic_sizes_reduced.values), np.array(self.topic_sizes_reduced.index)
else:
return np.array(self.topic_sizes.values), np.array(self.topic_sizes.index)
def get_topics(self, num_topics=None, reduced=False):
"""
Get topics, ordered by decreasing size. All topics are returned
if num_topics is not specified.
The original topics found are returned unless reduced=True,
in which case reduced topics will be returned.
Each topic will consist of the top 50 semantically similar words
to the topic. These are the 50 words closest to the topic vector,
along with the cosine similarity of each word to that vector. The
higher the score, the more relevant the word is to the topic.
Parameters
----------
num_topics: int, (Optional)
Number of topics to return.
reduced: bool (Optional, default False)
Original topics are returned by default. If True the
reduced topics will be returned.
Returns
-------
topics_words: array of shape(num_topics, 50)
For each topic the top 50 words are returned, in order
of semantic similarity to topic.
Example:
[['data', 'deep', 'learning' ... 'artificial'], <Topic 0>
['environment', 'warming', 'climate' ... 'temperature'] <Topic 1>
...]
word_scores: array of shape(num_topics, 50)
For each topic the cosine similarity scores of the
top 50 words to the topic are returned.
Example:
[[0.7132, 0.6473, 0.5700 ... 0.3455], <Topic 0>
[0.7818, 0.7671, 0.7603 ... 0.6769] <Topic 1>
...]
topic_nums: array of int, shape(num_topics)
The unique number of every topic will be returned.
"""
if reduced:
self._validate_hierarchical_reduction()
if num_topics is None:
num_topics = len(self.topic_vectors_reduced)
else:
self._validate_num_topics(num_topics, reduced)
return self.topic_words_reduced[0:num_topics], self.topic_word_scores_reduced[0:num_topics], np.array(
range(0, num_topics))
else:
if num_topics is None:
num_topics = len(self.topic_vectors)
else:
self._validate_num_topics(num_topics, reduced)
return self.topic_words[0:num_topics], self.topic_word_scores[0:num_topics], np.array(range(0, num_topics))
def get_topic_hierarchy(self):
"""
Get the hierarchy of reduced topics. The mapping of each original topic
to the reduced topics is returned.
Hierarchical topic reduction must be performed before calling this
method.
Returns
-------
hierarchy: list of ints
Each index of the hierarchy corresponds to the topic number of a
reduced topic. For each reduced topic the topic numbers of the
original topics that were merged to create it are listed.
Example:
[[3] <Reduced Topic 0> contains original Topic 3
[2,4] <Reduced Topic 1> contains original Topics 2 and 4
[0,1] <Reduced Topic 3> contains original Topics 0 and 1
...]
"""
self._validate_hierarchical_reduction()
return self.hierarchy
def hierarchical_topic_reduction(self, num_topics):
"""
Reduce the number of topics discovered by Top2Vec.
The most representative topics of the corpus will be found, by
iteratively merging each smallest topic to the most similar topic until
num_topics is reached.
Parameters
----------
num_topics: int
The number of topics to reduce to.
Returns
-------
hierarchy: list of ints
Each index of hierarchy corresponds to the reduced topics, for each
reduced topic the indexes of the original topics that were merged
to create it are listed.
Example:
[[3] <Reduced Topic 0> contains original Topic 3
[2,4] <Reduced Topic 1> contains original Topics 2 and 4
[0,1] <Reduced Topic 3> contains original Topics 0 and 1
...]
"""
self._validate_hierarchical_reduction_num_topics(num_topics)
num_topics_current = self.topic_vectors.shape[0]
top_vecs = self.topic_vectors
top_sizes = [self.topic_sizes[i] for i in range(0, len(self.topic_sizes))]
hierarchy = [[i] for i in range(self.topic_vectors.shape[0])]
count = 0
interval = max(int(self._get_document_vectors().shape[0] / 50000), 1)
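# Recomputing exact topic sizes after every merge is expensive for large corpora, so exact
# counts are refreshed only every `interval` merges; in between, the merged topic's size is
# approximated as the sum of the two merged sizes.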
while num_topics_current > num_topics:
# find smallest and most similar topics
smallest = np.argmin(top_sizes)
res = np.inner(top_vecs[smallest], top_vecs)
sims = np.flip(np.argsort(res))
most_sim = sims[1]
if most_sim == smallest:
most_sim = sims[0]
# calculate combined topic vector
top_vec_smallest = top_vecs[smallest]
smallest_size = top_sizes[smallest]
top_vec_most_sim = top_vecs[most_sim]
most_sim_size = top_sizes[most_sim]
combined_vec = self._l2_normalize(((top_vec_smallest * smallest_size) +
(top_vec_most_sim * most_sim_size)) / (smallest_size + most_sim_size))
# update topic vectors
ix_keep = list(range(len(top_vecs)))
ix_keep.remove(smallest)
ix_keep.remove(most_sim)
top_vecs = top_vecs[ix_keep]
top_vecs = np.vstack([top_vecs, combined_vec])
num_topics_current = top_vecs.shape[0]
# update topics sizes
if count % interval == 0:
doc_top = self._calculate_documents_topic(topic_vectors=top_vecs,
document_vectors=self._get_document_vectors(),
dist=False)
topic_sizes = pd.Series(doc_top).value_counts()
top_sizes = [topic_sizes[i] for i in range(0, len(topic_sizes))]
else:
smallest_size = top_sizes.pop(smallest)
if most_sim < smallest:
most_sim_size = top_sizes.pop(most_sim)
else:
most_sim_size = top_sizes.pop(most_sim - 1)
combined_size = smallest_size + most_sim_size
top_sizes.append(combined_size)
count += 1
# update topic hierarchy
smallest_inds = hierarchy.pop(smallest)
if most_sim < smallest:
most_sim_inds = hierarchy.pop(most_sim)
else:
most_sim_inds = hierarchy.pop(most_sim - 1)
combined_inds = smallest_inds + most_sim_inds
hierarchy.append(combined_inds)
# re-calculate topic vectors from clusters
doc_top = self._calculate_documents_topic(topic_vectors=top_vecs,
document_vectors=self._get_document_vectors(),
dist=False)
self.topic_vectors_reduced = self._l2_normalize(np.vstack([self._get_document_vectors()
[np.where(doc_top == label)[0]]
.mean(axis=0) for label in set(doc_top)]))
self.hierarchy = hierarchy
# assign documents to topic
self.doc_top_reduced, self.doc_dist_reduced = self._calculate_documents_topic(self.topic_vectors_reduced,
self._get_document_vectors())
# find topic words and scores
self.topic_words_reduced, self.topic_word_scores_reduced = self._find_topic_words_and_scores(
topic_vectors=self.topic_vectors_reduced)
# calculate topic sizes
self.topic_sizes_reduced = self._calculate_topic_sizes(hierarchy=True)
# re-order topics
self._reorder_topics(hierarchy=True)
return self.hierarchy
def query_documents(self, query, num_docs, return_documents=True, use_index=False, ef=None, tokenizer=None):
"""
Semantic search of documents using a text query.
The most semantically similar documents to the query will be returned.
Parameters
----------
query: string
Any sequence of text. This could be an actual question, a sentence,
a paragraph or a document.
num_docs: int
Number of documents to return.
return_documents: bool (Optional default True)
Determines if the documents will be returned. If they were not
saved in the model they will not be returned.
use_index: bool (Optional default False)
If index_documents method has been called, setting this to True
will speed up search for models with large number of documents.
ef: int (Optional default None)
Higher ef leads to more accurate but slower search. This value
must be higher than num_docs.
For more information see:
https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
tokenizer: callable (Optional, default None)
** For doc2vec embedding model only **
Override the default tokenization method. If None then
gensim.utils.simple_preprocess will be used.
Returns
-------
documents: (Optional) array of str, shape(num_docs)
The documents in a list, the most similar are first.
Will only be returned if the documents were saved and if
return_documents is set to True.
doc_scores: array of float, shape(num_docs)
Semantic similarity of document to vector. The cosine similarity of
the document and vector.
doc_ids: array of int, shape(num_docs)
Unique ids of documents. If ids were not given to the model, the
index of the document in the model will be returned.
"""
self._validate_query(query)
self._validate_num_docs(num_docs)
if self.embedding_model != "doc2vec":
query_vec = self._embed_query(query)
else:
# if tokenizer is not passed use default
if tokenizer is None:
tokenizer = default_tokenizer
tokenized_query = tokenizer(query)
query_vec = self.model.infer_vector(doc_words=tokenized_query,
alpha=0.025,
min_alpha=0.01,
epochs=100)
return self.search_documents_by_vector(query_vec, num_docs, return_documents=return_documents,
use_index=use_index, ef=ef)
def query_topics(self, query, num_topics, reduced=False, tokenizer=None):
"""
Semantic search of topics using a text query.
These are the topics closest to the embedded query. Topics are ordered by
proximity to the query vector. Successive topics in the list are less
semantically similar to the query.
Parameters
----------
query: string
Any sequence of text. This could be an actual question, a sentence,
a paragraph or a document.
num_topics: int
Number of topics to return.
reduced: bool (Optional, default False)
Original topics are searched by default. If True the
reduced topics will be searched.
tokenizer: callable (Optional, default None)
** For doc2vec embedding model only **
Override the default tokenization method. If None then
gensim.utils.simple_preprocess will be used.
Returns
-------
topics_words: array of shape (num_topics, 50)
For each topic the top 50 words are returned, in order of semantic
similarity to topic.
Example:
[['data', 'deep', 'learning' ... 'artificial'], <Topic 0>
['environment', 'warming', 'climate' ... 'temperature'] <Topic 1>
...]
word_scores: array of shape (num_topics, 50)
For each topic the cosine similarity scores of the top 50 words
to the topic are returned.
Example:
[[0.7132, 0.6473, 0.5700 ... 0.3455], <Topic 0>
[0.7818, 0.7671, 0.7603 ... 0.6769] <Topic 1>
...]
topic_scores: array of float, shape(num_topics)
For each topic the cosine similarity to the search keywords will be
returned.
topic_nums: array of int, shape(num_topics)
The unique number of every topic will be returned.
"""
self._validate_query(query)
if self.embedding_model != "doc2vec":
query_vec = self._embed_query(query)
else:
# if tokenizer is not passed use default
if tokenizer is None:
tokenizer = default_tokenizer
tokenized_query = tokenizer(query)
query_vec = self.model.infer_vector(doc_words=tokenized_query,
alpha=0.025,
min_alpha=0.01,
epochs=100)
return self.search_topics_by_vector(query_vec, num_topics=num_topics, reduced=reduced)
def search_documents_by_vector(self, vector, num_docs, return_documents=True, use_index=False, ef=None):
"""
Semantic search of documents using a vector.
These are the documents closest to the vector. Documents are
ordered by proximity to the vector. Successive documents in the
list are less semantically similar to the vector.
Parameters
----------
vector: array of shape(vector dimension, 1)
The vector dimension should be the same as the vectors in
the topic_vectors variable. (i.e. model.topic_vectors.shape[1])
num_docs: int
Number of documents to return.
return_documents: bool (Optional default True)
Determines if the documents will be returned. If they were not
saved in the model they will not be returned.
use_index: bool (Optional default False)
If index_documents method has been called, setting this to True
will speed up search for models with large number of documents.
ef: int (Optional default None)
Higher ef leads to more accurate but slower search. This value
must be higher than num_docs.
For more information see:
https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
Returns
-------
documents: (Optional) array of str, shape(num_docs)
The documents in a list, the most similar are first.
Will only be returned if the documents were saved and if
return_documents is set to True.
doc_scores: array of float, shape(num_docs)
Semantic similarity of document to vector. The cosine similarity of
the document and vector.
doc_ids: array of int, shape(num_docs)
Unique ids of documents. If ids were not given to the model, the
index of the document in the model will be returned.
"""
self._validate_vector(vector)
self._validate_num_docs(num_docs)
vector = self._l2_normalize(vector)
if use_index:
self._check_document_index_status()
if ef is not None:
self.document_index.set_ef(ef)
else:
self.document_index.set_ef(num_docs)
index_ids, doc_scores = self.document_index.knn_query(vector, k=num_docs)
index_ids = index_ids[0]
doc_ids = np.array([self.index_id2doc_id[index_id] for index_id in index_ids])
doc_scores = doc_scores[0]
doc_scores = np.array([1 - score for score in doc_scores])
doc_indexes = self._get_document_indexes(doc_ids)
else:
doc_indexes, doc_scores = self._search_vectors_by_vector(self._get_document_vectors(),
vector, num_docs)
doc_ids = self._get_document_ids(doc_indexes)
if self.documents is not None and return_documents:
documents = self.documents[doc_indexes]
return documents, doc_scores, doc_ids
else:
return doc_scores, doc_ids
def search_words_by_vector(self, vector, num_words, use_index=False, ef=None):
"""
Semantic search of words using a vector.
These are the words closest to the vector. Words are ordered by
proximity to the vector. Successive words in the list are less
semantically similar to the vector.
Parameters
----------
vector: array of shape(vector dimension, 1)
The vector dimension should be the same as the vectors in
the topic_vectors variable. (i.e. model.topic_vectors.shape[1])
num_words: int
Number of words to return.
use_index: bool (Optional default False)
If index_words method has been called, setting this to True will
speed up search for models with large number of words.
ef: int (Optional default None)
Higher ef leads to more accurate but slower search. This value
must be higher than num_words.
For more information see:
https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
Returns
-------
words: array of str, shape(num_words)
The words in a list, the most similar are first.
word_scores: array of float, shape(num_words)
Semantic similarity of word to vector. The cosine similarity of
the word and vector.
"""
self._validate_vector(vector)
vector = self._l2_normalize(vector)
if use_index:
self._check_word_index_status()
if ef is not None:
self.word_index.set_ef(ef)
else:
self.word_index.set_ef(num_words)
word_indexes, word_scores = self.word_index.knn_query(vector, k=num_words)
word_indexes = word_indexes[0]
word_scores = word_scores[0]
word_scores = np.array([1 - score for score in word_scores])
else:
word_indexes, word_scores = self._search_vectors_by_vector(self._get_word_vectors(),
vector, num_words)
words = np.array([self._index2word(index) for index in word_indexes])
return words, word_scores
def search_topics_by_vector(self, vector, num_topics, reduced=False):
"""
Semantic search of topics using a vector.
These are the topics closest to the vector. Topics are ordered by
proximity to the vector. Successive topics in the list are less
semantically similar to the vector.
Parameters
----------
vector: array of shape(vector dimension, 1)
The vector dimension should be the same as the vectors in
the topic_vectors variable. (i.e. model.topic_vectors.shape[1])
num_topics: int
Number of topics to return.
reduced: bool (Optional, default False)
Original topics are searched by default. If True the
reduced topics will be searched.
Returns
-------
topics_words: array of shape (num_topics, 50)
For each topic the top 50 words are returned, in order of semantic
similarity to topic.
Example:
[['data', 'deep', 'learning' ... 'artificial'], <Topic 0>
['environment', 'warming', 'climate' ... 'temperature'] <Topic 1>
...]
word_scores: array of shape (num_topics, 50)
For each topic the cosine similarity scores of the top 50 words
to the topic are returned.
Example:
[[0.7132, 0.6473, 0.5700 ... 0.3455], <Topic 0>
[0.7818, 0.7671, 0.7603 ... 0.6769] <Topic 1>
...]
topic_scores: array of float, shape(num_topics)
For each topic the cosine similarity to the search keywords will be
returned.
topic_nums: array of int, shape(num_topics)
The unique number of every topic will be returned.
"""
self._validate_vector(vector)
self._validate_num_topics(num_topics, reduced)
vector = self._l2_normalize(vector)
if reduced:
self._validate_hierarchical_reduction()
topic_nums, topic_scores = self._search_vectors_by_vector(self.topic_vectors_reduced,
vector, num_topics)
topic_words = [self.topic_words_reduced[topic] for topic in topic_nums]
word_scores = [self.topic_word_scores_reduced[topic] for topic in topic_nums]
else:
topic_nums, topic_scores = self._search_vectors_by_vector(self.topic_vectors,
vector, num_topics)
topic_words = [self.topic_words[topic] for topic in topic_nums]
word_scores = [self.topic_word_scores[topic] for topic in topic_nums]
return topic_words, word_scores, topic_scores, topic_nums
def search_documents_by_topic(self, topic_num, num_docs, return_documents=True, reduced=False):
"""
Get the most semantically similar documents to the topic.
These are the documents closest to the topic vector. Documents are
ordered by proximity to the topic vector. Successive documents in the
list are less semantically similar to the topic.
Parameters
----------
topic_num: int
The topic number to search.
num_docs: int
Number of documents to return.
return_documents: bool (Optional default True)
Determines if the documents will be returned. If they were not
saved in the model they will not be returned.
reduced: bool (Optional, default False)
Original topics are used to search by default. If True the
reduced topics will be used.
Returns
-------
documents: (Optional) array of str, shape(num_docs)
The documents in a list, the most similar are first.
Will only be returned if the documents were saved and if
return_documents is set to True.
doc_scores: array of float, shape(num_docs)
Semantic similarity of document to topic. The cosine similarity of
the document and topic vector.
doc_ids: array of int, shape(num_docs)
Unique ids of documents. If ids were not given to the model, the
index of the document in the model will be returned.
"""
if reduced:
self._validate_hierarchical_reduction()
self._validate_topic_num(topic_num, reduced)
self._validate_topic_search(topic_num, num_docs, reduced)
topic_document_indexes = np.where(self.doc_top_reduced == topic_num)[0]
topic_document_indexes_ordered = np.flip(np.argsort(self.doc_dist_reduced[topic_document_indexes]))
doc_indexes = topic_document_indexes[topic_document_indexes_ordered][0:num_docs]
doc_scores = self.doc_dist_reduced[doc_indexes]
doc_ids = self._get_document_ids(doc_indexes)
else:
self._validate_topic_num(topic_num, reduced)
self._validate_topic_search(topic_num, num_docs, reduced)
topic_document_indexes = np.where(self.doc_top == topic_num)[0]
topic_document_indexes_ordered = np.flip(np.argsort(self.doc_dist[topic_document_indexes]))
doc_indexes = topic_document_indexes[topic_document_indexes_ordered][0:num_docs]
doc_scores = self.doc_dist[doc_indexes]
doc_ids = self._get_document_ids(doc_indexes)
if self.documents is not None and return_documents:
documents = self.documents[doc_indexes]
return documents, doc_scores, doc_ids
else:
return doc_scores, doc_ids
def search_documents_by_keywords(self, keywords, num_docs, keywords_neg=None, return_documents=True,
use_index=False, ef=None):
"""
Semantic search of documents using keywords.
The most semantically similar documents to the combination of the
keywords will be returned. If negative keywords are provided, the
documents will be semantically dissimilar to those words. Too many
keywords or certain combinations of words may give strange results.
This method finds an average vector (negative keywords are subtracted)
of all the keyword vectors and returns the documents closest to the
resulting vector.
Parameters
----------
keywords: List of str
List of positive keywords being used for search of semantically
similar documents.
keywords_neg: List of str (Optional)
List of negative keywords being used for search of semantically
dissimilar documents.
num_docs: int
Number of documents to return.
return_documents: bool (Optional default True)
Determines if the documents will be returned. If they were not
saved in the model they will also not be returned.
use_index: bool (Optional default False)
If index_documents method has been called, setting this to True
will speed up search for models with large number of documents.
ef: int (Optional default None)
Higher ef leads to more accurate but slower search. This value
must be higher than num_docs.
For more information see:
https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
Returns
-------
documents: (Optional) array of str, shape(num_docs)
The documents in a list, the most similar are first.
Will only be returned if the documents were saved and if
return_documents is set to True.
doc_scores: array of float, shape(num_docs)
Semantic similarity of document to keywords. The cosine similarity
of the document and average of keyword vectors.
doc_ids: array of int, shape(num_docs)
Unique ids of documents. If ids were not given to the model, the
index of the document in the model will be returned.
"""
if keywords_neg is None:
keywords_neg = []
self._validate_num_docs(num_docs)
keywords, keywords_neg = self._validate_keywords(keywords, keywords_neg)
word_vecs = self._words2word_vectors(keywords)
neg_word_vecs = self._words2word_vectors(keywords_neg)
if use_index:
self._check_document_index_status()
combined_vector = self._get_combined_vec(word_vecs, neg_word_vecs)
return self.search_documents_by_vector(combined_vector, num_docs, return_documents=return_documents,
use_index=True, ef=ef)
if self.embedding_model == 'doc2vec':
sim_docs = self.model.dv.most_similar(positive=word_vecs,
negative=neg_word_vecs,
topn=num_docs)
doc_indexes = [doc[0] for doc in sim_docs]
doc_scores = np.array([doc[1] for doc in sim_docs])
else:
combined_vector = self._get_combined_vec(word_vecs, neg_word_vecs)
doc_indexes, doc_scores = self._search_vectors_by_vector(self._get_document_vectors(),
combined_vector, num_docs)
doc_ids = self._get_document_ids(doc_indexes)
if self.documents is not None and return_documents:
documents = self.documents[doc_indexes]
return documents, doc_scores, doc_ids
else:
return doc_scores, doc_ids
def similar_words(self, keywords, num_words, keywords_neg=None, use_index=False, ef=None):
"""
Semantic similarity search of words.
The most semantically similar word to the combination of the keywords
will be returned. If negative keywords are provided, the words will be
semantically dissimilar to those words. Too many keywords or certain
combinations of words may give strange results. This method finds an
average vector (negative keywords are subtracted) of all the keyword
vectors and returns the words closest to the resulting vector.
Parameters
----------
keywords: List of str
List of positive keywords being used for search of semantically
similar words.
keywords_neg: List of str
List of negative keywords being used for search of semantically
dissimilar words.
num_words: int
Number of words to return.
use_index: bool (Optional default False)
If index_words method has been called, setting this to True will
speed up search for models with large number of words.
ef: int (Optional default None)
Higher ef leads to more accurate but slower search. This value
must be higher than num_words.
For more information see:
https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
Returns
-------
words: array of str, shape(num_words)
The words in a list, the most similar are first.
word_scores: array of float, shape(num_words)
Semantic similarity of word to keywords. The cosine similarity of
the word and average of keyword vectors.
"""
if keywords_neg is None:
keywords_neg = []
keywords, keywords_neg = self._validate_keywords(keywords, keywords_neg)
word_vecs = self._words2word_vectors(keywords)
neg_word_vecs = self._words2word_vectors(keywords_neg)
combined_vector = self._get_combined_vec(word_vecs, neg_word_vecs)
num_res = min(num_words + len(keywords) + len(keywords_neg), self._get_word_vectors().shape[0])
words, word_scores = self.search_words_by_vector(vector=combined_vector,
num_words=num_res,
use_index=use_index,
ef=ef)
res_indexes = [index for index, word in enumerate(words)
if word not in list(keywords) + list(keywords_neg)][:num_words]
words = words[res_indexes]
word_scores = word_scores[res_indexes]
return words, word_scores
def search_topics(self, keywords, num_topics, keywords_neg=None, reduced=False):
"""
Semantic search of topics using keywords.
The most semantically similar topics to the combination of the keywords
will be returned. If negative keywords are provided, the topics will be
semantically dissimilar to those words. Topics will be ordered by
decreasing similarity to the keywords. Too many keywords or certain
combinations of words may give strange results. This method finds an
average vector (negative keywords are subtracted) of all the keyword
vectors and returns the topics closest to the resulting vector.
Parameters
----------
keywords: List of str
List of positive keywords being used for search of semantically
similar documents.
keywords_neg: (Optional) List of str
List of negative keywords being used for search of semantically
dissimilar documents.
num_topics: int
Number of topics to return.
reduced: bool (Optional, default False)
Original topics are searched by default. If True the
reduced topics will be searched.
Returns
-------
topics_words: array of shape (num_topics, 50)
For each topic the top 50 words are returned, in order of semantic
similarity to topic.
Example:
[['data', 'deep', 'learning' ... 'artificial'], <Topic 0>
['environment', 'warming', 'climate' ... 'temperature'] <Topic 1>
...]
word_scores: array of shape (num_topics, 50)
For each topic the cosine similarity scores of the top 50 words
to the topic are returned.
Example:
[[0.7132, 0.6473, 0.5700 ... 0.3455], <Topic 0>
[0.7818, 0.7671, 0.7603 ... 0.6769] <Topic 1>
...]
topic_scores: array of float, shape(num_topics)
For each topic the cosine similarity to the search keywords will be
returned.
topic_nums: array of int, shape(num_topics)
The unique number of every topic will be returned.
"""
if keywords_neg is None:
keywords_neg = []
keywords, keywords_neg = self._validate_keywords(keywords, keywords_neg)
word_vecs = self._words2word_vectors(keywords)
neg_word_vecs = self._words2word_vectors(keywords_neg)
combined_vector = self._get_combined_vec(word_vecs, neg_word_vecs)
return self.search_topics_by_vector(combined_vector, num_topics=num_topics, reduced=reduced)
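# Illustrative (hypothetical) call of search_topics; the German keywords below are
# placeholders, not taken from the actual corpus:
# topics_words, word_scores, topic_scores, topic_nums = model.search_topics(
#     keywords=["niere"], keywords_neg=["tumor"], num_topics=5)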
def search_documents_by_documents(self, doc_ids, num_docs, doc_ids_neg=None, return_documents=True,
use_index=False, ef=None):
"""
Semantic similarity search of documents.
The most semantically similar documents to the semantic combination of
document ids provided will be returned. If negative document ids are
provided, the documents will be semantically dissimilar to those
document ids. Documents will be ordered by decreasing similarity. This
method finds the closest document vectors to the provided documents
averaged.
Parameters
----------
doc_ids: List of int, str
Unique ids of document. If ids were not given, the index of
document in the original corpus.
doc_ids_neg: (Optional) List of int, str
Unique ids of document. If ids were not given, the index of
document in the original corpus.
num_docs: int
Number of documents to return.
return_documents: bool (Optional default True)
Determines if the documents will be returned. If they were not
saved in the model they will also not be returned.
use_index: bool (Optional default False)
If index_documents method has been called, setting this to True
will speed up search for models with large number of documents.
ef: int (Optional default None)
Higher ef leads to more accurate but slower search. This value
must be higher than num_docs.
For more information see:
https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
Returns
-------
documents: (Optional) array of str, shape(num_docs)
The documents in a list, the most similar are first.
Will only be returned if the documents were saved and if
return_documents is set to True.
doc_scores: array of float, shape(num_docs)
Semantic similarity of each document to the provided documents. The cosine
similarity of the document vector and the average of the provided document vectors.
doc_ids: array of int, shape(num_docs)
Unique ids of documents. If ids were not given to the model, the
index of the document in the model will be returned.
"""
if doc_ids_neg is None:
doc_ids_neg = []
self._validate_num_docs(num_docs)
self._validate_doc_ids(doc_ids, doc_ids_neg)
doc_indexes = self._get_document_indexes(doc_ids)
doc_indexes_neg = self._get_document_indexes(doc_ids_neg)
if use_index:
self._check_document_index_status()
document_vectors = self._get_document_vectors()
doc_vecs = [document_vectors[ind] for ind in doc_indexes]
doc_vecs_neg = [document_vectors[ind] for ind in doc_indexes_neg]
combined_vector = self._get_combined_vec(doc_vecs, doc_vecs_neg)
return self.search_documents_by_vector(combined_vector, num_docs, return_documents=return_documents,
use_index=True, ef=ef)
if self.embedding_model == 'doc2vec':
sim_docs = self.model.dv.most_similar(positive=doc_indexes,
negative=doc_indexes_neg,
topn=num_docs)
doc_indexes = [doc[0] for doc in sim_docs]
doc_scores = np.array([doc[1] for doc in sim_docs])
else:
doc_vecs = [self.document_vectors[ind] for ind in doc_indexes]
doc_vecs_neg = [self.document_vectors[ind] for ind in doc_indexes_neg]
combined_vector = self._get_combined_vec(doc_vecs, doc_vecs_neg)
num_res = min(num_docs + len(doc_indexes) + len(doc_indexes_neg),
self._get_document_vectors().shape[0])
# don't return documents that were searched
search_doc_indexes = list(doc_indexes) + list(doc_indexes_neg)
doc_indexes, doc_scores = self._search_vectors_by_vector(self._get_document_vectors(),
combined_vector, num_res)
res_indexes = [index for index, doc_ind in enumerate(doc_indexes)
if doc_ind not in search_doc_indexes][:num_docs]
doc_indexes = doc_indexes[res_indexes]
doc_scores = doc_scores[res_indexes]
doc_ids = self._get_document_ids(doc_indexes)
if self.documents is not None and return_documents:
documents = self.documents[doc_indexes]
return documents, doc_scores, doc_ids
else:
return doc_scores, doc_ids
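# Illustrative (hypothetical) call: find the 10 reports most similar to documents
# 0 and 5 while steering away from document 3 (the ids are made-up placeholders):
# documents, doc_scores, doc_ids = model.search_documents_by_documents(
#     doc_ids=[0, 5], doc_ids_neg=[3], num_docs=10)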
def generate_topic_wordcloud(self, topic_num, background_color="black", reduced=False):
"""
Create a word cloud for a topic.
A word cloud will be generated and displayed. The most semantically
similar words to the topic will have the largest size, less similar
words will be smaller. The size is determined using the cosine distance
of the word vectors from the topic vector.
Parameters
----------
topic_num: int
The topic number to search.
background_color : str (Optional, default='black')
Background color for the word cloud image. Suggested options are:
* white
* black
reduced: bool (Optional, default False)
Original topics are used by default. If True the
reduced topics will be used.
Returns
-------
A matplotlib plot of the word cloud with the topic number will be
displayed.
"""
if reduced:
self._validate_hierarchical_reduction()
self._validate_topic_num(topic_num, reduced)
word_score_dict = dict(zip(self.topic_words_reduced[topic_num],
softmax(self.topic_word_scores_reduced[topic_num])))
else:
self._validate_topic_num(topic_num, reduced)
word_score_dict = dict(zip(self.topic_words[topic_num],
softmax(self.topic_word_scores[topic_num])))
plt.figure(figsize=(16, 4),
dpi=200)
plt.axis("off")
plt.imshow(
WordCloud(width=1600,
height=400,
background_color=background_color).generate_from_frequencies(word_score_dict))
plt.title("Topic " + str(topic_num), loc='left', fontsize=25, pad=20)
\ No newline at end of file
import sys, os
from tqdm import tqdm
import pandas as pd
import pickle
from database_preparation.preprocess import print_meta_data
from database_preparation.utils_labeled_datasets import is_text_lst_tokenized
sys.path.append(os.getcwd())
# parse arguments:
from TextClustering.argsparse_clustering_preamble import argsparse_preamble
args = argsparse_preamble()
print("arguments:")
print(args)
if is_text_lst_tokenized(args.path2corpus):
print("Error: "+args.path2corpus + '.pkl is tokenized! '
'Please pass texts list where each text is a single string!')
exit(1)
#%% load the data
with open(args.path2corpus, 'rb') as f:
diag_lst = pickle.load(f)
print_meta_data(args.path2corpus)
#%% perform clustering repetitive
if args.find_k_value:
from TextClustering.utils_metrics import ClusterMetrics
from TextClustering.basedOn_Embedding.top2vec import Top2Vec
import matplotlib.pyplot as plt # load our modified version (for visualization)
s_score, n_cluster = [], []
cluster_size = range(3, 25, 2)
for i_cluster_size in tqdm(cluster_size):
#%% perform text-clustering (like in the paper)
hdbscan_args = {'min_cluster_size': i_cluster_size,
'metric': 'euclidean',
'cluster_selection_method': 'eom'}
model = Top2Vec(diag_lst,
embedding_model=args.model2use,
min_count=0,
hdbscan_args=hdbscan_args)
#%% get the clusters
n_cluster.append(model.get_num_topics())
evaluation = ClusterMetrics(model.umap_model.embedding_[model.result.labels >= 0,],
model.clustered.labels.tolist())
s_score.append(evaluation.s_score)
#%% plot the results
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ax1.plot(cluster_size, s_score, 'bx-')
ax2.plot(cluster_size, n_cluster, 'rx-')
ax1.set_xlabel('Minimal cluster size')
ax1.yaxis.label.set_color('blue')
ax1.set_ylabel('Silhouette Coefficient')
ax2.yaxis.label.set_color('red')
ax2.set_ylabel('Number of clusters')
plt.title('Elbow-method-like plot')
plt.show()
exit()
#%% perform text-clustering (like in the paper)
from TextClustering.basedOn_Embedding.top2vec import Top2Vec # load our modified version (for visualization)
hdbscan_args = {'min_cluster_size': args.k_value,
'metric': 'euclidean',
'cluster_selection_method': 'eom'}
model = Top2Vec(diag_lst,
embedding_model=args.model2use,
min_count=0,
hdbscan_args=hdbscan_args)
#%% get the words and topics
# using top2vec's built-in functions
model.get_num_topics()
topic_sizes, topic_nums = model.get_topic_sizes()
# print some infos:
outliers = model.umap_model.embedding_[model.result.labels == -1,]
print(f"found {len(topic_nums)} topics")
print(f"found {len(outliers)} outilers.")
topic_words, word_scores, topic_nums = model.get_topics(len(topic_nums))
pd.DataFrame(topic_words).to_excel(
'TextClustering/tables/WordsPerCluster_Top2Vec_' + args.model2use + '.xlsx',
sheet_name= "in-function")
# save umaped vectors and labels:
df = pd.read_pickle(args.df_cases_file)
df['umapX_top2vec'] = model.result.x
df['umapY_top2vec'] = model.result.y
df['label_top2vec'] = model.result.labels
df.to_pickle(args.df_cases_file)
#%% calculate clustering-metrics
from TextClustering.utils_metrics import ClusterMetrics
evaluation = ClusterMetrics(model.umap_model.embedding_[model.result.labels >= 0,], model.clustered.labels.tolist(),
file_name= "TextClustering/cluster_metrics/top2vec_metrics.pkl")
evaluation.write_to_file()
\ No newline at end of file
# -*- coding: iso-8859-1 -*-
import pickle
from os import listdir
from os.path import isfile, join
import numpy as np
import pandas as pd
import database_preparation.utils_labeled_datasets as dt
from TextClassification.classification_for_cluster_evaluation import cross_validate_with_simple_SVM
from CorpusHomogeneity.cluster_entropy import cluster_entropy
from CorpusHomogeneity.text_entropy import corpus_entropy
recalc_cls_accuracy = True
use_always_bow_data_for_svm_accuracy = True
sort_table_by = ['s-score'] # s-score or cls accuracy
table_save_path = 'TextClustering/tables/cluster_metrics_overview'
path2corpus_bow_preprocessed = 'database/bow_prepro_diag.pkl'
path2corpus_embedding_preprocessed = 'database/embedding_prepro_diag.pkl'
scorepath = "TextClustering/cluster_metrics/"
df_cases_file = './database/df_cases.pkl'
def main():
# ########## print cluster scores as latex table: ##################
methodnames = ['KMeans', 'LDA', 'HDBSCAN', 'German_BERT', 'Patho_BERT', 'top2vec', 'GSDPMM']
skipped_methods = []
print(dt.get_all_label_set_ids())
s_scores = []
entropy_scores = []
cls_ac_scores = []
cluster_nums = []
report_nums = []
round_to = 3
for label_set in methodnames:
try:
scores = pd.read_pickle(scorepath + label_set + "_metrics.pkl")[label_set+'_metrics']
except:
print(f"skipping {label_set}.")
skipped_methods.append(label_set)
continue
if label_set in ['German_BERT', 'Patho_BERT', 'top2vec']:
text_corpus_path = path2corpus_embedding_preprocessed
else:
text_corpus_path = path2corpus_bow_preprocessed
try:
s_scores.append(round(scores['s-score'],3))
except:
s_scores.append(None)
try:
cluster_nums.append(str(dt.get_amount_unique_labels(label_set)))
except:
cluster_nums.append(None)
try:
report_nums.append(str(dt.get_amount_reports(label_set)))
except:
report_nums.append(None)
### cls accuracy with svm ###
if recalc_cls_accuracy:
if use_always_bow_data_for_svm_accuracy:
metrics = cross_validate_with_simple_SVM(label_set,
path2corpus_bow_preprocessed,
df_cases_file)
else:
metrics = cross_validate_with_simple_SVM(label_set,
text_corpus_path,
df_cases_file)
print("================ f1-per cluster for cluster-set: " + label_set + " ================")
df = metrics.classes_scores(-1)
print(df.to_latex().replace('{}', 'cluster'))
cls_ac_scores.append(round(np.mean(metrics.scores['accuracy']), round_to))
else:
try:
cls_ac_scores.append(round(scores['svm-accuracy'], round_to))
except:
cls_ac_scores.append(None)
### calculate entropy ###
with open(text_corpus_path, 'rb') as f:
text = pickle.load(f)
df = pd.read_pickle(df_cases_file)
clusters = df['label_'+label_set].tolist()
frame = pd.DataFrame({'text': text, 'cluster': clusters}, index=[clusters])
ent = cluster_entropy(frame)
ent_mean, ent_std = corpus_entropy(text)
entropy_scores.append(round(ent[0] / ent_mean, round_to))
for methodname in skipped_methods:
methodnames.remove(methodname)
methodnames = [n.replace("_metrics", "") for n in methodnames]
df = pd.DataFrame({'cluster method': methodnames, 's-score': s_scores,
'cls accuracy': cls_ac_scores, 'rel entropy': entropy_scores,
'clusters': cluster_nums, 'corpus size': report_nums})
df.sort_values(by=sort_table_by, inplace=True, ascending=False)
latex_table = df.to_latex(index=False)
print("%================== clustering metric scores =================")
print(latex_table)
print("%===================================\n\n")
with open(table_save_path+'_latex.txt', 'w') as f:
f.write(latex_table)
df.to_excel(table_save_path+'.xlsx')
if __name__ == '__main__':
main()
\ No newline at end of file
import os, sys
# params:
path2corpus_bow_preprocessed_diagnosis = 'database/bow_prepro_diag.pkl'
path2corpus_embedding_preprocessed_diagnosis = 'database/embedding_prepro_diag.pkl'
# check if we are at correct working directory:
workdir = os.getcwd()
if not workdir[-len('nlp-in-diagnostic-texts-from-nephropathology'):] == 'nlp-in-diagnostic-texts-from-nephropathology':
print(workdir + " is the wrong working directory.")
print("please make shure to run this script with working directory '.../path/to/nlp-in-diagnostic-texts-from-nephropathology'.")
exit(1)
# add some folders for metrics, plot, tables etc:
if not os.path.isdir('TextClustering/cluster_metrics'):
os.makedirs('TextClustering/cluster_metrics')
if not os.path.isdir('TextClustering/tables'):
os.makedirs('TextClustering/tables')
if not os.path.isdir('TextClustering/plots'):
os.makedirs('TextClustering/plots')
if not os.path.isdir('TextClustering/plots/histograms'):
os.makedirs('TextClustering/plots/histograms')
if not os.path.isdir('TextClustering/plots/UMAP'):
os.makedirs('TextClustering/plots/UMAP')
if not os.path.isdir('TextClustering/plots/PCA'):
os.makedirs('TextClustering/plots/PCA')
# Construct the clustering pipeline. This is a suggestion for how to use all the scripts.
# I also recommend running each clustering script one by one to fine-tune the clusterings (with argument --find_k_value).
script_queue = [
f"python TextClustering/basedOn_BOW/kmeans_Diagnosis.py --path2corpus {path2corpus_bow_preprocessed_diagnosis} --k_value {10}",
f"python TextClustering/basedOn_BOW/LDA_Diagnosis.py --path2corpus {path2corpus_bow_preprocessed_diagnosis} --k_value {12}",
f"python TextClustering/basedOn_BOW/HDBSCAN_Diagnosis.py --path2corpus {path2corpus_bow_preprocessed_diagnosis} --k_value {10}",
f"python TextClustering/basedOn_BOW/GSDPMM_Diagnosis.py --path2corpus {path2corpus_bow_preprocessed_diagnosis} --k_value {14}",
f"python TextClustering/basedOn_Embedding/BERT_Diagnosis.py --path2corpus {path2corpus_embedding_preprocessed_diagnosis} --do_embedding --model2use German_BERT --k_value {17}",
f"python TextClustering/basedOn_Embedding/BERT_Diagnosis.py --path2corpus {path2corpus_embedding_preprocessed_diagnosis} --do_embedding --model2use Patho_BERT --k_value {8}",
f"python TextClustering/basedOn_Embedding/top2vec_Diagnosis.py --path2corpus {path2corpus_embedding_preprocessed_diagnosis} --model2use doc2vec --k_value {7}",
"python TextClustering/cluster_scores2latextable.py",
"python TextClustering/plot_clustersets.py",
"python TextClustering/generate_topicwords.py",
"python TextClustering/clusterset_histos.py"
]
for script in script_queue:
print("\n########################################### executing ###########################################")
print(script)
print("####################################################################################################\n")
os.system(script)
import database_preparation.utils_labeled_datasets as dt
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sys, os
import argparse
sys.path.append(os.getcwd())
parser = argparse.ArgumentParser()
parser.add_argument("--df_cases_file", default="database/df_cases.pkl")
args = parser.parse_args()
plot_author_histos = False
cluster = 2
clustersets = ["HDBSCAN", "KMeans", "LDA", "GSDPMM",
"top2vec", "Patho_BERT", "German_BERT"]
df = pd.read_pickle(args.df_cases_file)
authors_labels = df["label_author"]
# plot histograms: how many docs have the same label (cluster index)?
for i,label_set in enumerate(clustersets):
try:
cluster_labels = dt.label_list_as_int_list(df['label_' + label_set])
except:
print(f"skipping {label_set}. it is not in the df_cases_file.")
continue
if plot_author_histos:
authors_of_cluster = [authors_labels[i] for i, label in enumerate(cluster_labels) if
label == cluster]
authors = np.asarray(authors_of_cluster)
x = [-1,0,1,2,3]
h = []
for l in x:
h.append(sum([1 for a in authors if a == l]))
plt.bar(x, height=h)
plt.title(label_set + " authors in cluster " + str(cluster))
file_path = 'TextClustering/plots/histograms/histogram_' + label_set + "_cluster" + str(cluster) + "_authors.png"
else:
labels = np.asarray([l for l in cluster_labels if l != -1])
label_num = dt.get_amount_unique_labels(label_set)
x = np.arange(label_num)
h = []
for l in x:
h.append(sum([1 for label in labels if label == l]))
plt.bar(x, height=h)
plt.xticks(x, x)
plt.title(label_set)
file_path = 'TextClustering/plots/histograms/histogram_' + label_set + ".png"
plt.xticks(x, x)
plt.savefig(file_path, dpi=600)
plt.close()
plt.clf()
print(f"generated {file_path}")
from TextClustering.utils_wordlist import generate_save_topicwords
import pandas as pd
from database_preparation.utils_labeled_datasets import label_list_as_int_list
from database_preparation.preprocess import get_metadata
import pickle
import openpyxl
# parameters:
df_cases_file = "database/df_cases.pkl"
print_latex = False
filter_stop_words = True
path2umap_pics = 'TextClustering/plots/UMAP/'
save_umap_picture_in_table = True
path2corpus_bow_preprocessed = 'database/bow_prepro_diag.pkl'
path2corpus_embedding_preprocessed = 'database/embedding_prepro_diag.pkl'
####### functions ##########
def main():
cluster_sets = ['KMeans', 'LDA', 'HDBSCAN', 'German_BERT', 'Patho_BERT', 'top2vec', 'GSDPMM']
# cluster_sets = ['German_BERT']
df_cases = pd.read_pickle(df_cases_file)
for cluster_set in cluster_sets:
# re-generate the topic words:
excel_file_path = 'TextClustering/tables/WordsPerCluster_' + cluster_set + '.xlsx'
# convert nan-values in int(-1):
try:
clusters = label_list_as_int_list(df_cases['label_' + cluster_set])
except:
print(f"skipping {cluster_set}. it is not in the df_cases_file.")
continue
if cluster_set in ['German_BERT', 'Patho_BERT', 'top2vec']:
text_corpus_path = path2corpus_embedding_preprocessed
else:
text_corpus_path = path2corpus_bow_preprocessed
meta_params = get_metadata(text_corpus_path)
with open(text_corpus_path, 'rb') as f:
diag_lst = pickle.load(f)
# do not apply stopword filtering if the corpus is already stopword-filtered!
generate_save_topicwords(clusters, diag_lst, save_excel_file_path=excel_file_path,
n_words=10, print_latex_table=print_latex,
filter_stop_words=filter_stop_words and not meta_params['stopword_filtered'])
if save_umap_picture_in_table:
pic_path = path2umap_pics + cluster_set + "_UMAP.png"
try:
img = openpyxl.drawing.image.Image(pic_path)
wb = openpyxl.load_workbook(excel_file_path)
ws = wb.create_sheet("umap")
img.anchor = 'A1'
img.width = img.width / 2
img.height = img.height / 2
ws.add_image(img)
wb.save(excel_file_path)
print(f"Generated {excel_file_path}")
except:
print("could not load " + pic_path)
print("therefore, cant place umap picture into " + excel_file_path)
######### topic words of authors #########
clusters = label_list_as_int_list(df_cases['label_author'])
excel_file_path = 'TextClustering/tables/WordsPerCluster_authors.xlsx'
with open(path2corpus_bow_preprocessed, 'rb') as f:
diag_lst = pickle.load(f)
generate_save_topicwords(clusters, diag_lst, save_excel_file_path=excel_file_path,
n_words=20, print_latex_table=print_latex,
filter_stop_words=False)
if __name__ == '__main__':
main()
import pandas as pd
from TextClustering.utils_metrics import cluster_scatter_plot
import numpy as np
from database_preparation.utils_labeled_datasets import label_list_as_int_list
clustersets = ["GSDPMM", "KMeans", "LDA", "HDBSCAN",
"top2vec", "Patho_BERT", "German_BERT"]
plot_titles = ["GSDPMM (UMAP representation)", "k-means (UMAP representation)",
"LDA (UMAP representation)", "HDBSCAN (UMAP representation)",
"top2vec (UMAP representation)", "Patho-BERT (UMAP representation)",
"German-BERT (UMAP representation)"]
df_cases_file = "database/df_cases.pkl"
def save_umap_plot(clustersetname, df, title=None):
if not 'label_' + clustersetname in df:
print("skipping " + clustersetname + ", it is not in df_cases_file:")
print(df)
return
predictedCluster_text_features = label_list_as_int_list(df['label_' + clustersetname])
try:
umap_text_features2D = np.asarray([[e for e in df['umapX_' + clustersetname]],
[e for e in df['umapY_' + clustersetname]]])
except:
print("there is no umapX_" + clustersetname + " in database/df_cases.pkl. => skipping")
return
umap_text_features2D = np.transpose(umap_text_features2D)
cluster_scatter_plot(umap_text_features2D, predictedCluster_text_features,
"TextClustering/plots/UMAP/" + clustersetname + "_UMAP.png",
show_plot=False, colorblindfriendly=False, fig_title=title)
if 'label_author' in df:
author_labels = df["label_author"]
cluster_scatter_plot(umap_text_features2D, author_labels,
"TextClustering/plots/UMAP/" + clustersetname + "_UMAP_authors.png",
show_plot=False, colorblindfriendly=True, number_data_points=False
, fig_title=title + ", colored by authors")
if 'label_golden' in df:
golden_labels = df["label_golden"]
cluster_scatter_plot(umap_text_features2D, golden_labels,
"TextClustering/plots/UMAP/" + clustersetname + "_UMAP_goldenlabel.png",
show_plot=False, colorblindfriendly=True
, fig_title=title + " colored with golden labels")
def main():
df = pd.read_pickle(df_cases_file)
for clustersetname, title in zip(clustersets, plot_titles):
    save_umap_plot(clustersetname, df, title)
# plot author-colored and cluster-colored lda clustersets as pca representation:
if 'label_LDA' in df and 'pcaX_LDA' in df:
predictedCluster_text_features = df['label_LDA']
features2D = np.asarray([[e for e in df['pcaX_LDA']],
[e for e in df['pcaY_LDA']]])
features2D = np.transpose(features2D)
cluster_scatter_plot(features2D, predictedCluster_text_features,
"TextClustering/plots/PCA/LDA_PCA.png",
show_plot=False, colorblindfriendly=False,
fig_title="LDA (PCA representation)")
cluster_scatter_plot(features2D, df["label_author"],
"TextClustering/plots/PCA/LDA_PCA_authors.png",
show_plot=False, colorblindfriendly=True,
number_data_points=False, fig_title='LDA (PCA representation), colored by authors')
if __name__ == '__main__':
main()
import openpyxl
from TextClustering.utils_wordlist import get_top_cluster_words_as_latex_table
from googletrans import Translator # use pip install googletrans==3.1.0a0, 3.0 version is broken
from utils_general import custom_translation
path2table = "WordsPerCluster_HDBSCAN.xlsx"
green = 'FF00FF00'
blue = 'FF4A86E8'
orange = 'FFFF9900'
black = '1'
latex_weak_word = '\\weakcolor'
latex_strong_word = '\\strongcolor'
def color2latex_color(color):
if color == green:
return latex_strong_word
if color == blue:
return latex_weak_word
if color == orange:
return latex_weak_word
# print(f"unknown color: {color}")
return None
def get_annotated_exceltable(ws):
words_list = []
topics = []
colors = []
for idx, col in enumerate(ws.iter_rows(min_row=2, max_row=25, min_col=1, max_col=11)):
if col[0].value is None:
break
words_list.append([])
colors.append([])
for i, cell in enumerate(col):
if i == 0:
topics.append((cell.value, color2latex_color(cell.font.color.rgb)))
else:
words_list[idx].append(cell.value)
colors[idx].append(color2latex_color(cell.font.color.rgb))
# return get_top_cluster_words_as_latex_table(words_list, colors, topics)
return words_list, colors, topics
def main():
wb = openpyxl.load_workbook(path2table)
extraction_methods = ['tf-idf', 'SVM']
cluster_method = 'HDBSCAN'
annotate_svm_as_tfidf = True
print_also_translated_tables = True
translator = Translator()
words_list_tfidf = []
colorstfidf = []
topicstfidf = []
for i, extraction_method in enumerate(extraction_methods):
ws = wb[['TFIDF-based', 'svm-based'][i]]
words_list, colors, topics = get_annotated_exceltable(ws)
if annotate_svm_as_tfidf:
if extraction_method != 'tf-idf':
topics = topicstfidf
for j, words in enumerate(words_list):
for k, word in enumerate(words):
if word in words_list_tfidf[j]:
colors[j][k] = colorstfidf[j][words_list_tfidf[j].index(word)]
else:
words_list_tfidf, colorstfidf, topicstfidf = words_list, colors, topics
# print german topic words:
label = 'table_cluster_topics_' + cluster_method + '_' + extraction_method + '_ger'
# print("\n processing " + label+"...\n")
description = f'Annotated German topic words, extracted from the {cluster_method} cluster-set, ' \
f'using the {extraction_method} based extraction method.'
latex = get_top_cluster_words_as_latex_table(words_list, colors, topics).replace('DESCRIPTON',
description).replace(
'EXTRACTIONMETHOD', extraction_method).replace(
'LABEL', label
)
print(latex)
# print english topic words:
if print_also_translated_tables:
description = f'Annotated topic words (translated from German to English), ' \
f'extracted from the {cluster_method} cluster-set, ' \
f'using the {extraction_method} based extraction method.'
label = 'table_cluster_topics_' + cluster_method + '_' + extraction_method + '_eng'
word_list_eng = [[word if word.lower() not in custom_translation.keys() else custom_translation[word.lower()]
for word in words] for words in words_list]
topics_eng = [(translator.translate(topic[0], src='de').text, topic[1]) for topic in topics]
latex = get_top_cluster_words_as_latex_table(word_list_eng, colors, topics_eng).replace('DESCRIPTON',
description).replace(
'EXTRACTIONMETHOD', extraction_method).replace(
'LABEL', label
)
print(latex)
if __name__ == '__main__':
main()
#%% import
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from validclust import cop, dunn
from sklearn.metrics import pairwise_distances
import pandas as pd
import os
import numpy as np
#%% class definition
class ClusterMetrics:
def __init__(self, feature_matrix, feature_label, file_name = "cluster_metrics.pkl"):
self.file_name = file_name
self.feature_matrix = feature_matrix
if (type(feature_matrix).__name__) == "csr_matrix":
self.__feature_matrix_array__ = feature_matrix.toarray()
elif (type(feature_matrix).__name__) == "list":
self.__feature_matrix_array__ = np.array(feature_matrix)
else:
self.__feature_matrix_array__ = feature_matrix
self.feature_label = feature_label
self.__feature_label_array = np.array(feature_label)
self.__distance_matrix__ = pairwise_distances(self.__feature_matrix_array__)
# calculate the Silhouette Coefficient (values -1 to 1)
self.s_score = silhouette_score(feature_matrix, feature_label)
# calculate the Calinski-Harabasz Index (the higher the value, the better)
self.ch_index = calinski_harabasz_score(self.__feature_matrix_array__ , feature_label)
# calculate the Davies-Bouldin Index (the lower the value, the better)
self.db_score = davies_bouldin_score(self.__feature_matrix_array__, feature_label)
# calculate COP CVI
self.cop = cop(self.__feature_matrix_array__, self.__distance_matrix__, self.__feature_label_array)
# calculate Dunn CVI
self.dunn = dunn(self.__distance_matrix__, self.__feature_label_array)
# place for entropy
self.entropy = None
self.svm_accuracy = None
def write_to_file(self):
results = [np.round(self.s_score,3),
np.round(self.ch_index,3),
np.round(self.db_score,3),
np.round(self.cop,3),
np.round(self.dunn,3),
self.entropy,
self.svm_accuracy]
head, tail = os.path.split(self.file_name)
tail = tail[:-4]
df = pd.DataFrame(results,
index =['s-score', 'ch-index', 'db-score', 'cop', 'dunn-score', 'entropy', 'svm-accuracy'],
columns =[tail])
df.to_pickle(self.file_name)
print(df)
def __str__(self):
return "s-score: " + str(np.round(self.s_score,2)) + "[-1:1]" + "\n" + \
"ch-index: " + str(np.round(self.ch_index,2)) + " [0:]" "\n" + \
"db-score: " + str(np.round(self.db_score, 2)) + "[0:]" "\n" + \
"cop: " + str(np.round(self.cop, 2)) + " []" "\n" + \
"dunn: " + str(np.round(self.dunn, 2)) + " []" "\n" + \
"entropy: " + str(self.entropy) + " []" "\n" + \
"svm-accuracy: " + str(self.svm_accuracy) + " []"
#%% define plot functions for PCA
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
def plot_pca(text_features, labels, file_path = [], show_plot = True):
pca = PCA(n_components=2)
reduced_features = pca.fit_transform(text_features)
plt.close()
plt.scatter(x=reduced_features[:, 0], y=reduced_features[:, 1],
c=np.int8(labels), cmap="tab20")
plt.colorbar()
plt.title('PCA-representation')
if bool(file_path):
plt.savefig(file_path)
if show_plot:
plt.show()
#%% define plot function for T-SNE
from sklearn.manifold import TSNE
def plot_tsne(text_features, labels, file_path = [], show_plot = True):
tsne = TSNE(n_components=2, verbose=1, random_state=123)
reduced_features = tsne.fit_transform(text_features)
plt.close()
plt.scatter(x=reduced_features[:, 0], y=reduced_features[:, 1],
c=np.int8(labels), cmap="tab20")
plt.colorbar()
plt.title('T-SNE-representation')
if bool(file_path):
plt.savefig(file_path)
if show_plot:
plt.show()
#%% define plot function for UMAP
import umap
import seaborn as sns
def cluster_scatter_plot(umap_text_features2D, labels, file_path = [],
show_plot=True, colorblindfriendly=True,
number_data_points=True, fig_title=None):
'''umap_text_features2D = umap.UMAP(n_neighbors=15,
n_components=2,
min_dist=0.0, metric='cosine').fit_transform(text_features)'''
plt.close()
# plot invalid / unlabeled data points (label == -1) in black:
x = [e for i, e in enumerate(umap_text_features2D[:, 0]) if labels[i] == -1]
y = [e for i, e in enumerate(umap_text_features2D[:, 1]) if labels[i] == -1]
if len(x) > 0:
plt.scatter(x=x, y=y, c='black', marker='.')
# plot valid labeled datapoints:
x_val = [e for i,e in enumerate(umap_text_features2D[:, 0]) if labels[i] != -1]
y_val = [e for i,e in enumerate(umap_text_features2D[:, 1]) if labels[i] != -1]
valid_labels = [l for l in labels if l != -1]
if colorblindfriendly:  # use the colorblind palette (it has 10 colors)
style = []
for label in valid_labels:
if label <= 9: #0-9
style.append(0)
elif label >= 20: #20-inf
style.append(1)
else: #10-19
style.append(2)
sns.scatterplot(x=x_val, y=y_val, hue=valid_labels,
palette="colorblind", style=style,
legend=True, linewidth=.3)
if number_data_points:
nummerate_clusters_in_plot(x_val, y_val, valid_labels)
else:  # use tab20, it has 20 different colors
x = [e for i, e in enumerate(x_val) if valid_labels[i] <= 19]
y = [e for i, e in enumerate(y_val) if valid_labels[i] <= 19]
c = [e for e in valid_labels if e <= 19]
plt.scatter(x=x,
y=y,
c=np.int8(c),
cmap="tab20", edgecolors='white', linewidth=.3
, marker='o')
if number_data_points:
nummerate_clusters_in_plot(x,y,c)
'''plt.legend(handles=scatter.legend_elements()[0],
labels=[str(l) for l in c], loc="best")'''
plt.colorbar(values=[int(e) for e in np.unique(np.asarray(c))])
c = [e for e in valid_labels if e > 19]
if len(c)>0:
x = [e for i, e in enumerate(x_val) if valid_labels[i] > 19]
y = [e for i, e in enumerate(y_val) if valid_labels[i] > 19]
plt.scatter(x=x, y=y,
c=np.int8(c),
cmap="tab20", edgecolors='white', linewidth=.3
, marker='P')
if number_data_points:
nummerate_clusters_in_plot(x, y, c)
if fig_title is None:
if bool(file_path):
import os
fig_title = os.path.basename(file_path)
else:
fig_title = "UMAP"
plt.title(fig_title.replace(".png",""))
if bool(file_path):
print("generated "+file_path)
plt.savefig(file_path,dpi=300)
if show_plot:
plt.show()
def nummerate_clusters_in_plot(x,y,labels):
annotated_labels = []
for i, label in enumerate(labels):
if label not in annotated_labels:
plt.annotate(label, (x[i], y[i]))
annotated_labels.append(label)
def Jaccard_Similarity(doc1, doc2):
if isinstance(doc1, list):
doc1 = " ".join(doc1)
doc2 = " ".join(doc2)
# List the unique words in a document
words_doc1 = set(doc1.lower().split())
words_doc2 = set(doc2.lower().split())
#print(words_doc1)
#print(words_doc2)
# Find the intersection of words list of doc1 & doc2
intersection = words_doc1.intersection(words_doc2)
# Find the union of words list of doc1 & doc2
union = words_doc1.union(words_doc2)
# Calculate Jaccard similarity score
# using length of intersection set divided by length of union set
return float(len(intersection)) / len(union)
#%%
import numpy as np
def get_distance_matrix(str_list):
dist_matrix = np.zeros(shape=(len(str_list), len(str_list)))
# calculate the upper triangle (pairwise Jaccard similarities)
for i in range(0, len(str_list)):
for j in range(i+1, len(str_list)):
dist_matrix[i][j] = Jaccard_Similarity(str_list[i], str_list[j])
# mirror it into the lower triangle
for i in range(0, len(str_list)):
for j in range(0, len(str_list)):
if i == j:
dist_matrix[i][j] = 0
elif i > j:
dist_matrix[i][j] = dist_matrix[j][i]
return dist_matrix
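# Minimal self-check sketch for the Jaccard helpers above (toy strings, illustrative only):
if __name__ == '__main__':
    # "der" and "fall" are shared, the union has 4 unique words -> 2/4 = 0.5
    print(Jaccard_Similarity("der fall zeigt", "der fall ist"))
    # symmetric matrix with zero diagonal, filled with pairwise Jaccard similarities
    print(get_distance_matrix(["der fall zeigt", "der fall ist", "ganz anderer text"]))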
# -*- coding: iso-8859-1 -*-
from database_preparation.utils_stringpreparation import get_most_frequent_words
import numpy as np
import pandas
import yake
import nltk
import openpyxl as pxl
from database_preparation.utils_labeled_datasets import text_label_2_labeled_dataset
import pandas as pd
from database_preparation.utils_labeled_datasets import label_list_as_int_list
from database_preparation.stop_word_list import filter_stopwords
def get_nwordlist(text_lst, cluster_lst, n_words=10,
excel_file_path=[],
method="frequency-based",
filter_stop_words=True):
"""
- if excel_file_path is given, the word list will be saved as .xlsx with sheet name = method
"""
ignore_outlier = True
# %% prepare parameter
docs = {'text': text_lst, 'cluster': cluster_lst}
n_cluster = np.unique(cluster_lst)
frame = pandas.DataFrame(docs, index=[cluster_lst])
text1 = np.asarray(text_lst[0])
text_lst_is_tokenized = bool(text1.ndim)
if method in ['svm-based', 'LR-based']:
###### prepare text data: ######
if text_lst_is_tokenized:
if filter_stop_words:
dataset = text_label_2_labeled_dataset(
[filter_stopwords(text) for text in text_lst], cluster_lst
)
else:
dataset = text_label_2_labeled_dataset(text_lst, cluster_lst)
else:
# print("passed text list is not tokenized. Tokenizing it now with nltk...")
# tokenize
tokenized_texts = []
for t_text in text_lst:
if filter_stop_words:
tokenized_texts.append(filter_stopwords(nltk.tokenize.word_tokenize(t_text, language='german')))
else:
tokenized_texts.append(nltk.tokenize.word_tokenize(t_text, language='german'))
dataset = text_label_2_labeled_dataset(tokenized_texts, cluster_lst)
###### train svm: ######
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegressionCV
def identity(words):
return words
def get_trained_svm(texts, labels):
vec = TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)
# svd = TruncatedSVD(n_components=1000, n_iter=7, random_state=42)
# lsa = make_pipeline(vec, svd)
# clf = SVC(probability=True, kernel="linear")
clf = LinearSVC()
# pipe = make_pipeline(lsa, clf)
pipe = make_pipeline(vec, clf)
pipe.fit(texts, labels)
return pipe, vec, clf
def get_trained_LR(text, labels):
# vec = CountVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)
vec = TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)
clf = LogisticRegressionCV()
pipe = make_pipeline(vec, clf)
pipe.fit(text, labels)
return pipe, vec, clf
if method == 'svm-based':
pipe, vec, clf = get_trained_svm(dataset['text'], dataset['label'])
else:
pipe, vec, clf = get_trained_LR(dataset['text'], dataset['label'])
def get_correct_predictions(texts, labels, pipe):
y_preds = pipe.predict(texts)
correct_predictions = []
for i, y_pred in enumerate(y_preds):
if y_pred == labels[i]:
correct_predictions.append(i)
return correct_predictions
correct_predictions = get_correct_predictions(dataset['text'], dataset['label'], pipe)
'''print("found " + str(len(correct_predictions)) + '/' +
str(len(dataset['text'])) + " correct predicted docs.")'''
del clf, vec, pipe
if method == 'svm-based':
pipe, vec, clf = get_trained_svm(dataset[correct_predictions]['text'],
dataset[correct_predictions]['label'])
else:
pipe, vec, clf = get_trained_LR(dataset[correct_predictions]['text'],
dataset[correct_predictions]['label'])
'''print("using "+method+" which predicted " +
str(len(
get_correct_predictions(dataset[correct_predictions]['text'], dataset[correct_predictions]['label'], pipe)))
+ '/' + str(len(dataset[correct_predictions]['text'])) + " documents correctly.")'''
########## weight analysis ##############
# get feature importance:
feature_names = vec.get_feature_names_out()
top_word_lists = []
for i, coef in enumerate(clf.coef_):
if clf.__class__.__name__ == 'SVC':
coef = coef.toarray()
weights = list(zip(feature_names, [coef[0, i] for i in range(coef.shape[1])]))
else:
weights = list(zip(vec.get_feature_names_out(), coef))
most_positives = sorted(weights, key=lambda x: -x[1])[:n_words]
most_negatives = sorted(weights, key=lambda x: x[1])[:n_words]
top_word_lists.append([tup[0] for tup in most_positives])
'''f_mnames = vec.get_feature_names_out()
coefs = clf.coef_.ravel()
top_positive_coefficients = np.argsort(coefs)[-n_words:]
top_negative_coefficients = np.argsort(coefs)[:n_words]
top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])
print(f_mnames[top_coefficients])'''
save_topwordlist_as_excel(excel_file_path, top_word_lists, method)
return top_word_lists
# %% prepare different methods
if method == "TFIDF-based":
cluster_words = []
clusters = list(np.unique(cluster_lst))
if ignore_outlier: # if yes: remove cluster -1
if -1 in clusters:
clusters.remove(-1)
if text_lst_is_tokenized:
for i_cluster in clusters:
t_frame = frame[frame['cluster'] == i_cluster]
if filter_stop_words:
t_text = []
for text in t_frame['text'].to_list():
filtered = filter_stopwords(text)
t_text.append(filtered)
else:
t_text = t_frame['text'].to_list()
words_of_cluster = [word for text in t_text for word in text]
cluster_words.append(words_of_cluster)
else:
for i_cluster in clusters:
t_frame = frame[frame['cluster'] == i_cluster]
t_text = []
for text in t_frame['text'].to_list():
tokenized = nltk.tokenize.word_tokenize(text, language='german')
if filter_stop_words:
t_text.append(filter_stopwords(tokenized))
else:
t_text.append(tokenized)
words_of_cluster = [word for text in t_text for word in text]
cluster_words.append(words_of_cluster)
if method == "yake":
def flatten(lst):
text = ''
for t_text in lst:
text += str(t_text)
return text
kw_extractor = yake.KeywordExtractor(lan='German',
n=1,
dedupLim=0.9,
dedupFunc='seqm',
windowsSize=1,
top=n_words, features=None)
# %% perform it for TFIDF
if method == "TFIDF-based":
# %% prepare the corpus for tfidf
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
corpus = [str(i) for i in cluster_words]
vectors = vectorizer.fit_transform(corpus)
names = vectorizer.get_feature_names_out()
data = vectors.todense().tolist()
df = pandas.DataFrame(data, columns=names)
# %% create the words list
word_list = []
for i in df.iterrows():
i_words = i[1].sort_values(ascending=False)[:n_words]
word_list.append(i_words.index.to_list())
else:
# %% perform it for the other approaches
clusters = []
word_list = []
for i_cluster in n_cluster:
# ignore outlier
if ignore_outlier:
if i_cluster < 0:
continue
t_frame = frame[frame['cluster'] == i_cluster]
if method == "frequency-based":
if text_lst_is_tokenized:
if filter_stop_words:
t_text = [filter_stopwords(text) for text in t_frame['text'].to_list()]
else:
t_text = t_frame['text'].to_list()
else:
t_text = []
for text in t_frame['text'].to_list():
tokenized = nltk.tokenize.word_tokenize(text, language='german')
if filter_stop_words:
t_text.append(filter_stopwords(tokenized))
else:
t_text.append(tokenized)
top_words = get_most_frequent_words(str(t_text), n_words)
elif method == "yake":
if text_lst_is_tokenized:
# join word-lists into single string
if filter_stop_words:
cluster_text_list = [filter_stopwords(text) for text in t_frame['text'].to_list()]
else:
cluster_text_list = t_frame['text'].to_list()
t_text = [' '.join(text) for text in cluster_text_list]
else:
if filter_stop_words: # tokenize->filterstopwords->join
t_text_tokenized = []
for text in t_frame['text'].to_list(): # tokenize+filtering
tokenized = nltk.tokenize.word_tokenize(text, language='german')
t_text_tokenized.append(filter_stopwords(tokenized))
t_text = [' '.join(text) for text in t_text_tokenized] # join
else:
t_text = t_frame['text'].to_list()
keywords = kw_extractor.extract_keywords(flatten(t_text))
top_words = [i_key[0] for i_key in keywords]
clusters.append(i_cluster)
word_list.append(top_words)
# %% how to get n_words in an array
for i in range(0, len(word_list)):
t_token = np.array(word_list[i])
if len(t_token) < n_words + 1:
t_token = np.append(t_token, np.repeat(np.nan, n_words - len(t_token)))
t_token = t_token.reshape((1, -1))
if i == 0:
token_list = t_token
else:
token_list = np.concatenate((token_list, t_token), axis=0)
# %% save it to the excel (bad style the way to the dark side is...)
save_topwordlist_as_excel(excel_file_path, token_list, method)
# %% return the results
return word_list
def save_topwordlist_as_excel(file_path, token_list, sheet_name):
if bool(file_path):
try: # if excelfile does exist, append new sheet to workbook:
excel_book = pxl.load_workbook(file_path)
if sheet_name in excel_book.sheetnames:
    excel_book.remove(excel_book[sheet_name])
with pandas.ExcelWriter(file_path, engine='openpyxl', if_sheet_exists=None) as writer:
writer.book = excel_book
writer.sheets = {
worksheet.title: worksheet
for worksheet in excel_book.worksheets
}
pandas.DataFrame(token_list).to_excel(writer, sheet_name)
writer.save()
except: # otherwise: create new workbook and save
pandas.DataFrame(token_list).to_excel(file_path, sheet_name=sheet_name)
def generate_save_topicwords(predictedClusters, text_lst, save_excel_file_path,
n_words=10, print_latex_table=False,
extraction_methods=['TFIDF-based', 'frequency-based', 'yake', 'svm-based'],
filter_stop_words=True):
'''
creates the n_words most relevant topic words per cluster (tf-idf, tf, yake and svm based)
and saves them as .xlsx and as a latex table
'''
latextable = ''
for method in extraction_methods:
word_list = get_nwordlist(text_lst, predictedClusters,
n_words=n_words,
excel_file_path=save_excel_file_path,
method=method,
filter_stop_words=filter_stop_words)
latextable = latextable + "\n%================== " + save_excel_file_path + " " + method + " ================="
latextable = latextable + get_top_cluster_words_as_latex_table(word_list)
latextable = latextable + "%========================================================\n"
if print_latex_table:
print(latextable)
with open(save_excel_file_path.replace('.xlsx', '') + '_latex.txt', 'w') as f:
f.write(latextable)
def get_top_cluster_words_as_latex_table(words_lists, colors=None, cluster_topics=None):
first_part = '''
%%%%%%%%%%%%%%%%%%%%%% LABEL %%%%%%%%%%%%%%%%%%%%%%%%%%
\\begin{table}[!htb]
\caption{DESCRIPTON}\label{LABEL}
\\noindent
\\begin{tabularx}{\linewidth}{|@{}>{}l|@{\hspace{.5em}}X@{}|}
\hline
\\textbf{ cluster index - cluster name } & \\textbf{keywords according to EXTRACTIONMETHOD} \\\\ \\hline
'''
end_part = '''
\end{tabularx}
\end{table}
'''
if colors is None or cluster_topics is None:
latex_code_m = ""
for i, words in enumerate(words_lists):
latex_code_m = latex_code_m + str(i) + " & " + ", ".join(words) + " \\\\ \hline \n"
else:
latex_code_m = ""
colored_tops = [int(float(top[0])) if top[1] is None else '\\colorbox{' + top[1] + '}{' + top[0] + '}' for top in
cluster_topics]
for i, words in enumerate(words_lists):
colored_words = [word if colors[i][j] is None else '\\colorbox{'+colors[i][j]+'}{'+word+'}' for j,word in enumerate(words)]
latex_code_m = latex_code_m + str(colored_tops[i]) + " & " + ", ".join(colored_words) + " \\\\ \hline"
if i+1 < len(words_lists):
latex_code_m = latex_code_m + '\n'
latex_code = first_part + latex_code_m + end_part
latex_code = latex_code.replace("_", "\_")
# print("================== cluster topics " + cluster_method + " =================")
# print(latex_code)
# print("========================================================")
return latex_code
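# Illustrative (hypothetical) call without colour annotations: each inner list holds one
# cluster's top words and becomes one table row, e.g. "0 & niere, glomerulum \\ \hline":
# print(get_top_cluster_words_as_latex_table([["niere", "glomerulum"], ["tubulus", "interstitium"]]))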
def print_excel_topciwordlist_as_latex(cluster_set, excel_file_path=None, topic_word_method="TFIDF-based"):
if excel_file_path == None:
excel_file_path = 'TextClustering/tables/WordsPerCluster_' + cluster_set + '_temp.xlsx'
df_cases = pd.read_pickle("database/df_cases.pkl")
# convert nan-values in int(-1):
clusters = label_list_as_int_list(df_cases['label_' + cluster_set])
df = pd.read_excel(open(excel_file_path, 'rb'), sheet_name=topic_word_method).T
df.drop(['Unnamed: 0'], inplace=True)
rename_dic = {}
for i in range(np.max(clusters) + 1):
rename_dic[i] = ' ' + str(i)
df.rename(columns=rename_dic, inplace=True)
latex_code = '''
\\begin{table}[h]
\caption{DESCRIPTON}\label{table_cluster_topics}
\\noindent
\\begin{tabularx}{\linewidth}{|@{}>{\\bfseries}l|@{\hspace{.5em}}X@{}|}
\hline
'''
latex_code_m = ""
for k in rename_dic.values():
latex_code_m = latex_code_m + "\\textbf{" + str(k) + "} & " + ", ".join(df[k]) + " \\\\ \hline \n"
latex_code = latex_code + latex_code_m + '''
\end{tabularx}
\end{table}
'''
latex_code = latex_code.replace("_", "\_")
print("%================== cluster topics of cluster-set" + cluster_set + " =================")
print(latex_code)
print("%========================================================")
def main():
pass
if __name__ == '__main__':
main()
{"source_data": "../DataNephroTexts/description", "tokenized": true, "cased": false, "stopword_filtered": true, "use_combiner": true, "use_replacer": true, "lemma_mode": 3, "punct_mode": 2, "number_mode": 3}
\ No newline at end of file
{"source_data": "../DataNephroTexts/diagnosis", "tokenized": true, "cased": false, "stopword_filtered": true, "use_combiner": true, "use_replacer": true, "lemma_mode": 3, "punct_mode": 2, "number_mode": 3}
\ No newline at end of file
{"source_data": "../DataNephroTexts/description", "tokenized": false, "cased": true, "stopword_filtered": false, "use_combiner": true, "use_replacer": true, "lemma_mode": 4, "punct_mode": 1, "number_mode": 3}
\ No newline at end of file
{"source_data": "../DataNephroTexts/diagnosis", "tokenized": false, "cased": true, "stopword_filtered": false, "use_combiner": true, "use_replacer": true, "lemma_mode": 4, "punct_mode": 1, "number_mode": 3}
\ No newline at end of file
from transformers import AutoModelForMaskedLM, AutoTokenizer
import pickle
# script parameters:
modelname = "bert-base-german-cased"
path2corpus_embedding_preprocessed_diagnosis = 'database/embedding_prepro_diag.pkl'
path2corpus_embedding_preprocessed_description = 'database/embedding_prepro_desc.pkl'
tokenizer = AutoTokenizer.from_pretrained(modelname)
model = AutoModelForMaskedLM.from_pretrained(modelname)
unknown_id = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)
with open(path2corpus_embedding_preprocessed_description, 'rb') as f:
micro_texts = pickle.load(f)
with open(path2corpus_embedding_preprocessed_diagnosis, 'rb') as f:
diag_texts = pickle.load(f)
def find_oov_cases(texts):
oov_cases = 0
for text_num, text in enumerate(texts):
if unknown_id in tokenizer.encode(text):
tokens = text.split(" ")
for i, token in enumerate(tokens):
if unknown_id in tokenizer.encode(token):
oov_cases += 1
print("found OOV case in text " + str(text_num))
print("the word \'" + str(token) + "\' in " + str(tokens[i - 2:i + 2]) + " is OOV")
return oov_cases
oov_sum = find_oov_cases(micro_texts) + find_oov_cases(diag_texts)
print("\nFinished. Found " + str(oov_sum) + " OOV cases (see above).")
\ No newline at end of file
# -*- coding: iso-8859-1 -*-
import os
# params:
path_to_reports = '../DataNephroTexts/reports'
author_names = "Name1 Name2 Name3 Name4" ## <- Type in the names of the pathologists of your institut!
splitted_reports_folder_path = '../DataNephroTexts'
path2corpus_bow_preprocessed_diagnosis = 'database/bow_prepro_diag.pkl'
path2corpus_embedding_preprocessed_diagnosis = 'database/embedding_prepro_diag.pkl'
path2corpus_bow_preprocessed_description = 'database/bow_prepro_desc.pkl'
path2corpus_embedding_preprocessed_description = 'database/embedding_prepro_desc.pkl'
# check if we are at correct working directory:
workdir = os.getcwd()
if not workdir[-len('nlp-in-diagnostic-texts-from-nephropathology'):] == 'nlp-in-diagnostic-texts-from-nephropathology':
print(workdir + " is the wrong working directory.")
print("please make shure to run this script with working directory '.../path/to/nlp-in-diagnostic-texts-from-nephropathology'.")
exit(1)
preparation_queue = [
"python database_preparation/split_reports.py --path_to_reports " + path_to_reports + " --target_folder_path " + splitted_reports_folder_path + " --author_names \"" + author_names + '\"',
f"python database_preparation/preprocess.py --path_to_preprocessing_params {path2corpus_bow_preprocessed_diagnosis.replace('.pkl','_meta.json')} --target_path {path2corpus_bow_preprocessed_diagnosis}",
f"python database_preparation/preprocess.py --path_to_preprocessing_params {path2corpus_embedding_preprocessed_diagnosis.replace('.pkl','_meta.json')} --target_path {path2corpus_embedding_preprocessed_diagnosis}",
f"python database_preparation/preprocess.py --path_to_preprocessing_params {path2corpus_bow_preprocessed_description.replace('.pkl','_meta.json')} --target_path {path2corpus_bow_preprocessed_description}",
f"python database_preparation/preprocess.py --path_to_preprocessing_params {path2corpus_embedding_preprocessed_description.replace('.pkl','_meta.json')} --target_path {path2corpus_embedding_preprocessed_description}",
]
for script in preparation_queue:
print("\n########################################### executing ###########################################")
print(script)
print("####################################################################################################\n")
os.system(script)
# -*- coding: iso-8859-1 -*-
import sys, os
import pandas as pd
from database_preparation.utils_stringpreparation import read_german_text
import argparse
def amount_names(text):
return len(text.split('Dr.'))-1
def cut_off_by_keywords(text, keywords=['Nachtragsbefund','Nachbericht']):
'''
cuts off (removes) the text part that begins with any of the passed keyword(s)
and returns the new (shortened) text.
'''
for keyword in keywords:
if keyword in text:
text = text[:text.index(keyword)]
return text
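# Illustrative example (made-up text): everything from the first matching keyword on is dropped:
# cut_off_by_keywords("Hauptbefund unauffaellig. Nachbericht: weitere Faerbungen folgen.")
# -> "Hauptbefund unauffaellig. "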
def get_names(text):
names = []
for parts in text.split('Dr. med.')[1:]:
tokens = parts.split(' ')
for token in tokens:
if '.' in token:
continue
if True in [c.isdigit() for c in token]:
continue
if 'Tel' in token:
continue
if token in '- war Befundverwendung für wissenschaftliche Zwecke oder Gutachten nur mit Genehmigung des Befunders OA PD':
continue
names.append(token)
return names
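# Illustrative example (fictitious name): tokens with digits, dots, 'Tel' or words from the
# boilerplate string above are skipped, so only the surname remains:
# get_names("Befund erstellt von Dr. med. Mustermann") -> ['Mustermann']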
def add_author_labels_to_df_cases(path_to_end_sections, authors, df_cases_file = "database/df_cases.pkl"):
df = pd.read_pickle(df_cases_file)
filenames = df["end_text_files"]
author_labels = []
print(f"\nLabeling df_cases file with authors. Searching for {authors} in {path_to_end_sections}")
for idx, filename in enumerate(filenames):
text = cut_off_by_keywords(read_german_text(path_to_end_sections + '/' + filename))
# detect authors in text
authors_in_text = [0 for a in range(len(authors))]
for j, author in enumerate(authors):
if author in text:
authors_in_text[j] = 1
# if only one author detected:
autor_combination_as_decimal = sum([pow(2, i) * n for i, n in enumerate(authors_in_text)])
if sum(authors_in_text) == 1:
label = authors_in_text.index(1)
else:
label = -1
author_labels.append(label)
df['label_author'] = author_labels
df.to_pickle(df_cases_file)
print("=> finished. Results:")
for i, author in enumerate(authors):
num = 0
for label in author_labels:
if label == i:
num += 1
print(author + " accured " + str(num) + " times")
sum_no_author = 0
for label in author_labels:
if label == -1:
sum_no_author = sum_no_author + 1
print(str(sum_no_author) + " unknown authors.")
return True
def main():
# parse arguments:
sys.path.append(os.getcwd())
parser = argparse.ArgumentParser()
parser.add_argument("--path_to_end_sections",
default='../DataNephroTexts/end')
parser.add_argument("--author_names",
default="Name1 Name2")
args = parser.parse_args()
authors = args.author_names.split(' ')
add_author_labels_to_df_cases(args.path_to_end_sections, authors)
if __name__ == '__main__':
main()
\ No newline at end of file
# -*- coding: iso-8859-1 -*-
import sys, os
import glob
from tqdm import tqdm
import pickle
import random
import nltk
from HanTa import HanoverTagger as ht
from enum import Enum
from database_preparation.utils_stringpreparation import read_german_text
from database_preparation.utils_wordbase import RegexpReplacer, RegexpSynonyms
from database_preparation.stop_word_list import filter_stopwords
import json
import argparse
########## define enums ##########
class LemmatizeMode(Enum):
lemma_only_nouns = 1
lemma_only_nouns_adja = 2
lemma = 3
none = 4
class PunctuationMode(Enum):
keep = 1
remove = 2
replace = 3
class NumberMode(Enum):
keep = 1
remove = 2
replace = 3
########## define some symbols and lists ##########
num_replace_symbol = "*"
punct_replace_symbol = "--"
punctuations_to_remove = ['%', '=', '+', '-', '?', '<', '>', '\'', '``', '\'\'',
',', ';', '.', '*', '#', '', '\\', '/', '(', ')', '[', ']',
'{', '}', '~', ':']
do_not_lemma_list = ['igg', 'iga', 'igm']
########## Functions ##########
def prepro_params_2_string(params):
metadata_text = ""
for i, key in enumerate(params.keys()):
metadata_text = metadata_text + key + ': '
if key == 'lemma_mode':
metadata_text = metadata_text + str(LemmatizeMode(params[key])).replace('LemmatizeMode.', '')
elif key == 'punct_mode':
metadata_text = metadata_text + str(PunctuationMode(params[key])).replace('PunctuationMode.', '')
elif key == 'number_mode':
metadata_text = metadata_text + str(NumberMode(params[key])).replace('NumberMode.', '')
else:
metadata_text = metadata_text + str(params[key])
if i < len(params.keys()) - 1:
metadata_text = metadata_text + '\n'
return metadata_text
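# Illustrative example (assumed minimal parameter dict):
# prepro_params_2_string({'tokenized': True, 'lemma_mode': 3})
# -> "tokenized: True\nlemma_mode: lemma"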
def get_metadata(path_to_pickled_prepro_text_list):
try:
with open(path_to_pickled_prepro_text_list.replace('.pkl','_meta.json')) as json_file:
params = json.load(json_file)
return params
except:
return None
def print_meta_data(path_to_pickled_prepro_text_list):
try:
params = get_metadata(path_to_pickled_prepro_text_list)
print(prepro_params_2_string(params))
print()
return True
except:
return False
def is_histo_num(word):
if word[:2].lower() == "h/" and word[2].isdigit():
return True
return False
def is_date(word):
if '.20' in word and word[0].isdigit() and word[-1].isdigit():
return True
return False
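# Illustrative examples (made-up values):
# is_histo_num("H/2021") -> True   (starts with 'H/' followed by a digit)
# is_date("14.05.2021")  -> True   (contains '.20', starts and ends with a digit)
# is_date("Niere")       -> False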
def get_corpus_stats(path2corpus):
corpus_stats = {'total_token_count': 0, 'amount_docs': 0, 'tokens_per_doc': 0, }
file_list = glob.glob(path2corpus + '/*.txt')
for idx, t_file in tqdm(enumerate(file_list)):
t_text = read_german_text(t_file)
t_text = nltk.tokenize.word_tokenize(t_text, language='german')
corpus_stats['total_token_count'] += len(t_text)
corpus_stats['tokens_per_doc'] = float(corpus_stats['total_token_count']) / float(len(file_list))
corpus_stats['amount_docs'] = len(file_list)
return corpus_stats
def preprocess(parameter_dict):
"""
preprocesses a corpus located at source_data_path=.../path_to_corpus_folder.
This folder (the corpus) should contain the .txt files to be processed.
The .txt files should be named in the form <name>#<number>.txt.
returns preprocessed_corpus as list of shape:
[first_preprocessed_text, second_preprocessed_text, ...]
Histo numbers and dates will always be removed!
"""
source_data_path = parameter_dict['source_data']
do_tokenize = parameter_dict['tokenized']
cased = parameter_dict['cased']
stopword_filtered = parameter_dict['stopword_filtered']
use_combiner = parameter_dict['use_combiner']
use_replacer = parameter_dict['use_replacer']
lemma_mode = parameter_dict['lemma_mode']
punct_mode = parameter_dict['punct_mode']
number_mode = parameter_dict['number_mode']
lemma_mode = LemmatizeMode(lemma_mode)
punct_mode = PunctuationMode(punct_mode)
number_mode = NumberMode(number_mode)
combiner = RegexpSynonyms()
replacer = RegexpReplacer()
tagger = ht.HanoverTagger('morphmodel_ger.pgz')
file_list = glob.glob(source_data_path + '/*.txt')
file_list = sorted(file_list, key=lambda f: int(f[f.find("#") + 1:-4]))
preprocessed_corpus = []
random_example_idx = random.randrange(min(10, len(file_list)))
for idx, t_file in tqdm(enumerate(file_list)):
# %% load the txt-file
t_text = read_german_text(t_file)
original_text = t_text
# replace the words
if use_replacer:
t_text = replacer.replace(t_text)
# tokenize
t_text = nltk.tokenize.word_tokenize(t_text, language='german')
# filter stopwords
if stopword_filtered:
t_text = filter_stopwords(t_text)
# combine word pairs
if use_combiner:
t_text = combiner.combine(t_text)
# lemmatize / stemming
t_text = tagger.tag_sent(t_text)
# %% lemmatize the text
if lemma_mode == LemmatizeMode.lemma_only_nouns:
t_text = [lemma for (word, lemma, pos) in t_text if pos == "NN" or pos == "NE"]
elif lemma_mode == LemmatizeMode.lemma_only_nouns_adja:
t_text = [lemma for (word, lemma, pos) in t_text if pos == "NN" or pos == "NE" or pos == "ADJA"]
elif lemma_mode == LemmatizeMode.lemma:
lemmatized_text = []
for (word, lemma, pos) in t_text:
if lemma == '--' or word.lower() in do_not_lemma_list:
lemmatized_text.append(word)
else:
lemmatized_text.append(lemma)
t_text = lemmatized_text
del lemmatized_text
else: # none
t_text = [word for (word, lemma, pos) in t_text]
# %% filter punctuation:
if punct_mode == PunctuationMode.remove:
t_text = [token for token in t_text if token not in punctuations_to_remove]
elif punct_mode == PunctuationMode.replace:
t_text = [token if token not in punctuations_to_remove else punct_replace_symbol for token in t_text]
# %% number filtering
filtered_text = []
use_single_symbol = True
for i, word in enumerate(t_text):
# always remove dates and histonums:
if is_histo_num(word) or is_date(word):
continue
if number_mode != NumberMode.keep:
if word.isdigit(): # remove numbers
if number_mode == NumberMode.replace:
filtered_text.append('*' if use_single_symbol else '_zahl_')
continue
elif number_mode == NumberMode.remove:
continue
elif ',' in word: # remove "0,3"
w = word.split(',')
if len(w) == 2:
if w[0].isdigit() and w[1].isdigit():
if number_mode == NumberMode.replace:
filtered_text.append('*' if use_single_symbol else 'x,y')
continue
elif number_mode == NumberMode.remove:
continue
elif word[0].isdigit() and word[-1] == 'nm': # remove distances like 500nm
if number_mode == NumberMode.replace:
filtered_text.append('*' if use_single_symbol else 'x_nm')
continue
elif number_mode == NumberMode.remove:
continue
elif '/' in word: # remove stuff like 6/10
w = word.split('/')
if len(w) == 2:
if w[0].isdigit() and w[1].isdigit():
if number_mode == NumberMode.replace:
filtered_text.append('*' if use_single_symbol else 'x/y')
continue
elif number_mode == NumberMode.remove:
continue
elif '-' in word: # remove stuff like 5-10
w = word.split('-')
if len(w) == 2:
if w[0].isdigit() and w[1].isdigit():
if number_mode == NumberMode.replace:
filtered_text.append('*' if use_single_symbol else 'x-y')
continue
elif number_mode == NumberMode.remove:
continue
elif word[0].isdigit() and word[-1].lower() == 'x': # remove 6x ('six times ...')
if number_mode == NumberMode.replace:
filtered_text.append('*' if use_single_symbol else 'x_mal')
continue
elif number_mode == NumberMode.remove:
continue
elif word[0].isdigit() and word[-1].lower() == '%': # remove 5_%
if number_mode == NumberMode.replace:
filtered_text.append('*' if use_single_symbol else 'x_%')
continue
elif number_mode == NumberMode.remove:
continue
elif '+' in word: # remove sum expressions like "3+3+3=9/20"
if number_mode == NumberMode.replace:
filtered_text.append('*' if use_single_symbol else '_summe_')
continue
elif number_mode == NumberMode.remove:
continue
elif word == 'cm' or word == 'mm': # also remove distance-unit words:
if number_mode == NumberMode.replace:
filtered_text.append('*' if use_single_symbol else '_distanz_')
continue
elif number_mode == NumberMode.remove:
continue
filtered_text.append(word)
t_text = filtered_text
del filtered_text
# always lowercase the text at the end, otherwise
# case-sensitive operations might not work anymore!
if not cased:
t_text = [word.lower() for word in t_text]
# join the tokens back into a single string if the output should not be tokenized:
if not do_tokenize:
t_text = ' '.join(t_text)
# %% add to the list
preprocessed_corpus.append(t_text)
if idx == random_example_idx:
print("-------------- Preprocessing Example: ---------------")
print("Original text of " + t_file + ":")
print(original_text)
print("Processed text:")
print(t_text)
print("-----------------------------\n")
return preprocessed_corpus
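# Minimal usage sketch (the parameter values below are illustrative assumptions; in
# practice the dict is loaded from a *_meta.json file, see main() below):
#   example_params = {
#       'source_data': '../DataNephroTexts/diagnosis',
#       'tokenized': True,
#       'cased': False,
#       'stopword_filtered': True,
#       'use_combiner': True,
#       'use_replacer': True,
#       'lemma_mode': LemmatizeMode.lemma_only_nouns.value,
#       'punct_mode': PunctuationMode.remove.value,
#       'number_mode': NumberMode.remove.value,
#   }
#   preprocessed_corpus = preprocess(example_params)  # one entry per .txt file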
def main():
# parse arguments:
sys.path.append(os.getcwd())
parser = argparse.ArgumentParser()
parser.add_argument("--path_to_preprocessing_params",
default='database/bow_prepro_diag_meta.json')
parser.add_argument("--target_path",
default='database/bow_prepro_diag.pkl')
args = parser.parse_args()
with open(args.path_to_preprocessing_params) as json_file:
params = json.load(json_file)
print(f"------ Preprocessing parameters: ------")
print(prepro_params_2_string(params))
print()
preprocessed_corpus = preprocess(params)
with open(args.target_path, 'wb') as f:
pickle.dump(preprocessed_corpus, f)
print(f"saved preprocessed corpus at {args.target_path}")
'''print(get_corpus_stats("../DataNephroTexts/description"))
print(get_corpus_stats("../DataNephroTexts/diagnosis"))
print(get_corpus_stats("../DataNephroTexts/end"))'''
if __name__ == '__main__':
main()
from database_preparation.utils_stringpreparation import read_german_text
import re
def get3parts(t_file):
#%% load the text
t_text = read_german_text(t_file)
#%% define codon-find function
def find_codon(text, word_list):
codon = 'XENOTARSOSAURUS'
for i_word in word_list:
if text.find(i_word) > -1:
codon = i_word
return codon
#%% get start codons for description and diagnosis
# assuming that each pathologist sticks to his/her wording
start_codon_description = find_codon(t_text,
['Lichtmikroskopie:', 'Mikroskopie:',"Histologie:"])
start_codon_2nd = find_codon(t_text,
["Nachbericht", "Immunhistochemie"])
start_codon_conclusion = find_codon(t_text,
["Beurteilung:", "Begutachtung:"])
start_codon_comment = find_codon(t_text,
["Kommentar"])
if t_text.find("Mit freundlichen") > -1:
start_codon_greetings = "Mit freundlichen"
else:
start_codon_greetings = "Prof."
#%% set the stop codons and prepare the function
# like on DNA, the next start codon is a stop codon
stop_codon_list = [start_codon_conclusion, start_codon_description,
start_codon_comment, start_codon_greetings, start_codon_2nd]
from database_preparation.utils_stringpreparation import regexp
def get_codon_idx(text, start_codon, stop_codon_list):
_, idx_start = regexp(start_codon, text)
idx_stop = []
for i_idx_start in idx_start:
idx_stop_list = []
for i_stop_codon in stop_codon_list:
if not i_stop_codon == start_codon:
idx_stop_list.append(text[i_idx_start:].find(i_stop_codon))
idx_stop_list = [item for item in idx_stop_list if item >= 0]
idx_stop.append(min(idx_stop_list) + i_idx_start)
return idx_start, idx_stop
#%% find the indices for the text-frames
start_description, stop_description = get_codon_idx(t_text,
start_codon_description,
stop_codon_list)
start_2nd, stop_2nd = get_codon_idx(t_text,
start_codon_2nd,
stop_codon_list)
start_conclusion, stop_conclusion = get_codon_idx(t_text,
start_codon_conclusion,
stop_codon_list)
#%% get the text parts
def get_text_frame(idx_start_list, idx_stop_list, text):
t_frame = []
for i in range(0, len(idx_start_list)):
t_frame.append(text[idx_start_list[i]:idx_stop_list[i]])
return t_frame
txt_micro = get_text_frame(start_description, stop_description, t_text)
txt_2nd = get_text_frame(start_2nd, stop_2nd, t_text)
txt_conclusion = get_text_frame(start_conclusion, stop_conclusion, t_text)
#%% finalise the text
txt_conclusion = txt_conclusion[-1]
if not txt_2nd == []:
txt_micro = str(txt_micro[0]) + str(txt_2nd[0])
else:
txt_micro = str(txt_micro[0])
# get greetings-section:
start_greetings_index = t_text.find("Mit freundlichen")
txt_greetings = None
if start_greetings_index == -1:
start_greetings_index = t_text.find("Prof.")
if start_greetings_index != -1:
txt_greetings = t_text[start_greetings_index:]
return txt_micro, txt_conclusion, txt_greetings
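# Usage sketch (hypothetical report file; real file names follow the pattern used by the
# splitting script further below, e.g. H.123.0.txt):
#   txt_micro, txt_conclusion, txt_greetings = get3parts('../DataNephroTexts/reports/H.123.0.txt')
#   print(txt_conclusion)  # the "Beurteilung:" / "Begutachtung:" part of the report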
import pandas as pd
import pickle
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from database_preparation.preprocess import print_meta_data, prepro_params_2_string
# parameters:
df_cases_file = "database/df_cases.pkl"
text_corpus_paths = ['database/embedding_prepro_diag.pkl',
'database/bow_prepro_diag.pkl',
'database/embedding_prepro_desc.pkl',
'database/bow_prepro_desc.pkl']
vector_corpus_paths = ['database/diagnosis_texts_vectorized_DR_preprocessed.pkl',
'database/diagnosis_texts_vectorized_bow_preprocessed.pkl',
'database/description_texts_vectorized_DR_preprocessed.pkl',
'database/description_texts_vectorized_bow_preprocessed.pkl']
####### functions ##########
def identity(words):
return words
def get_trained_tfidf(texts):
vec = TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)
return vec.fit_transform(texts)
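# Note: the identity tokenizer/preprocessor is used because the texts are expected to be
# tokenized already (lists of tokens), so TfidfVectorizer only counts and weights them.
# Usage sketch with hypothetical token lists:
#   X = get_trained_tfidf([['Niere', 'Biopsie'], ['Niere', 'Tubulus']])
#   print(X.shape)  # (2, vocabulary_size)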
def save_vectorized_text(text_corpus_path, vector_corpus_path):
with open(text_corpus_path, 'rb') as f:
text_lst = pickle.load(f)
text1 = np.asarray(text_lst[0])
text_lst_is_tokenized = bool(text1.ndim)
if not text_lst_is_tokenized:
tokenized_texts = []
for t_text in text_lst:
tokenized_texts.append(nltk.tokenize.word_tokenize(t_text, language='german'))
text_lst = tokenized_texts
del tokenized_texts
vectorized_text = get_trained_tfidf(text_lst)
with open(vector_corpus_path, 'wb') as f:
pickle.dump(vectorized_text, f)
print(f"saved {vector_corpus_path}")
# save metadata:
'''with open(text_corpus_path.replace('.pkl', '_meta.pkl'), 'rb') as f:
params = pickle.load(f)
metadata_text = prepro_params_2_string(params)
with open(vector_corpus_path.replace('.pkl', '_meta.txt'), 'w') as f:
f.write(metadata_text)'''
def main():
for i, text_corpus_path in enumerate(text_corpus_paths):
save_vectorized_text(text_corpus_path, vector_corpus_paths[i])
if __name__ == '__main__':
main()
'''
This script splits the reports (.txt files) located in the
--path_to_reports folder into description-, diagnosis- and end-sections
and saves these parts in --target_folder_path (as .txt files).
In addition, a pandas dataframe (--df_cases_file) is generated, which
records which three report-sections belong together. The df_cases
dataframe is also used to label the reports with different labelsets.
Also pass --author_names (space-separated names, cased) to label the reports by the
authors found (stored in df_cases).
'''
import sys
import glob
from tqdm import tqdm
import pandas as pd
from database_preparation.reportPreparationTools import get3parts
import os, shutil
import argparse
from database_preparation.label_reports_with_authors import add_author_labels_to_df_cases
sys.path.append(os.getcwd())
# parse arguments:
sys.path.append(os.getcwd())
parser = argparse.ArgumentParser()
parser.add_argument("--path_to_reports",
default='../DataNephroTexts/reports')
parser.add_argument("--target_folder_path",
default='../DataNephroTexts')
parser.add_argument("--df_cases_file",
default='database/df_cases.pkl')
parser.add_argument("--author_names",
default="Name1 Name2")
parser.add_argument("--text_encoding",
default="latin-1")
parser.add_argument("--use_newest_reports", action='store_true')
args = parser.parse_args()
# %% get all files
# get the primary reports, whose histo numbers end with .0:
report_file_list = glob.glob(args.path_to_reports + '/*0.txt')
# get all last reports (newest ones)
if args.use_newest_reports:
print("using newest reports (if there is a H.123.0 and a H.123.2, take H.123.2)")
for idx in tqdm(range(0, len(report_file_list))):
t_file_name = report_file_list[idx]
report_file_list[idx] = glob.glob(t_file_name[0:-5] + "*.txt")[-1]
else:
print("using oldes reports (if there is a H.123.0 and a H.123.2, take H.123.0)")
def make_folder(dir):
shutil.rmtree(dir, ignore_errors=True)
os.makedirs(dir)
make_folder(args.target_folder_path + "/description")
make_folder(args.target_folder_path + "/diagnosis")
make_folder(args.target_folder_path + "/end")
make_folder(args.target_folder_path + "/short_diagnosis")
print(f"Splitting reports of corpus {args.path_to_reports} into description-, diagnosis- and end-sections...")
# %% iterate over all files
error_file_list = []
no_error_file_list = []
lst_description, lst_diagnose, lst_end = [], [], []
for idx, t_file in tqdm(enumerate(report_file_list)):
try:
txt_micro, txt_conclusion, end = get3parts(t_file)
no_error_file_list.append(t_file)
except:
#print(f"skipped file {t_file}. Could not split the text into description and diagnosis-part")
error_file_list.append(t_file)
continue
with open(args.target_folder_path + "/description/description#" + str(idx) + ".txt", "w",
encoding=args.text_encoding) as text_file:
text_file.write(txt_micro)
with open(args.target_folder_path + "/diagnosis/diagnosis#" + str(idx) + ".txt", "w",
encoding=args.text_encoding) as text_file:
text_file.write(txt_conclusion)
with open(args.target_folder_path + "/end/end#" + str(idx) + ".txt", "w",
encoding=args.text_encoding) as text_file:
if end is None:
end = "None"
text_file.write(end)
lst_description.append('description#' + str(idx) + ".txt")
lst_diagnose.append('diagnosis#' + str(idx) + ".txt")
lst_end.append('end#' + str(idx) + ".txt")
#save skipped reports:
with open(args.target_folder_path + "/failed_to_split_list.txt", "w") as text_file:
text_file.write('\n'.join(error_file_list))
# print infos:
print(" ===> finished < === ")
print(f"skipped {len(error_file_list)} reports "
f"({round(len(error_file_list)/len(report_file_list)*100,1)}%), since text splitting "
f"failed (see {args.target_folder_path + '/failed_to_split_list.txt'}).")
processed_docs = len(lst_description)
print(f"saved {processed_docs} description sections at {args.target_folder_path + '/description'}")
print(f"saved {processed_docs} diagnosis sections at {args.target_folder_path + '/diagnosis'}")
print(f"saved {processed_docs} end sections at {args.target_folder_path + '/end'}")
# create and save df_cases file, which is also used to save the labels (=cluster-indices) for each text.
df = pd.DataFrame(list(zip(lst_description, lst_diagnose, lst_end)),
columns=['description_text_files', 'diagnosis_text_files', 'end_text_files'])
df.to_pickle(args.df_cases_file)
# search for authors in end-sections in order to add them as labels to the df_cases file:
try:
add_author_labels_to_df_cases(args.target_folder_path + '/end', args.author_names.split(' '), args.df_cases_file)
except:
print("label the reports with authors failed.")
df = pd.read_pickle(args.df_cases_file)
print(f"saved df_cases at {args.df_cases_file}\n")
print()
import nltk
from database_preparation.utils_wordbase import RegexpReplacer
lst_stopwords_patho = ['Ca.', 'Ca', 'ca.', 'ca', 'Circa', 'circa']
# there are a few words in the german nltk.corpus.stopwords list which we want to keep!
lst_stopwords_to_keep = ['keiner', 'keinen', 'keines', 'keinem', 'keine', 'kein']
def filter_stopwords(tokenized_txt, additional_stop_words=None):
'''
- the passed text list has to be tokenized!
'''
stop_words = nltk.corpus.stopwords.words('german')
stop_words.extend(lst_stopwords_patho)
if type(additional_stop_words) == list:
stop_words.extend(additional_stop_words)
replacer = RegexpReplacer()
for word in lst_stopwords_to_keep:
stop_words.remove(word)
new_stopwords = []
for word in stop_words: # add replaced words to stoplist:
new_word = replacer.replace(word)
if word != new_word:
new_stopwords.append(new_word)
stop_words.extend(new_stopwords)
return [w for w in tokenized_txt if w.lower() not in stop_words]
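# Usage sketch (hypothetical token list; negations like 'kein' are kept on purpose):
#   filter_stopwords(['kein', 'Anhalt', 'für', 'eine', 'Abstoßung'])
#   -> ['kein', 'Anhalt', 'Abstoßung'] (the exact result depends on the nltk stopword list)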
\ No newline at end of file
import math
import pandas as pd
import os
import sys
sys.path.append(os.getcwd())
import datasets
import pyarrow as pa
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
import pickle
import scipy
import random
from tqdm import tqdm
path2textfiles = "../DataNephroTexts/input/"
path2diagnosefiles = "../DataNephroTexts/label/"
def is_text_lst_tokenized(path2corpus):
try:
text_lst = pd.read_pickle(path2corpus)
text1 = np.asarray(text_lst[0])
return bool(text1.ndim)
except:
return False
def is_text_lst_tfidf_vectorized(path2corpus):
try:
with open(path2corpus, 'rb') as f:
loaded_texts = pickle.load(f)
return type(loaded_texts) == scipy.sparse.csr.csr_matrix
except:
return False
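# Usage sketch (pickle paths as used elsewhere in this repo; both helpers return False
# if the file cannot be read):
#   is_text_lst_tokenized('database/bow_prepro_diag.pkl')
#   -> True if the pickle holds token lists instead of plain strings
#   is_text_lst_tfidf_vectorized('database/diagnosis_texts_vectorized_bow_preprocessed.pkl')
#   -> True if the pickle holds a scipy sparse CSR matrix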
def text_label_2_labeled_dataset(texts, unfiltered_labels, print_infos=False):
'''
- sorts out outlier documents (which belong to cluster -1 or cluster None)
- converts the passed text-label pairs to the datasets.Dataset type.
- returns dataset in format: {"text": labeled_texts, "label": labels}
'''
# collect all text-label pairs, skipping invalid labels
labeled_texts = []
labels = []
skipped_labels = 0
# throw out invalid labels:
for i, l in enumerate(unfiltered_labels):
try:
label = int(l)
if label < 0:
skipped_labels += 1
continue
except:
skipped_labels += 1
continue
labels.append(label)
labeled_texts.append(texts[i])
if print_infos:
print("skipped " + str(skipped_labels))
labels = label_list_as_int_list(labels)
# convert it to a hf_dataset, that we can use our tools:
df = pd.DataFrame({"text": labeled_texts, "label": labels})
return datasets.Dataset(pa.Table.from_pandas(df))
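# Usage sketch (hypothetical toy data): invalid labels (None, -1) are dropped together
# with their texts before the datasets.Dataset is built:
#   ds = text_label_2_labeled_dataset(['text a', 'text b', 'text c'], [0, -1, None])
#   ds['text']   -> ['text a']
#   ds['label']  -> [0]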
def text_label_files_to_labeled_dataset(label_set,
path2corpus="./database/bow_prepro_desc.pkl",
df_cases_path="./database/df_cases.pkl", print_infos=False):
'''
- sorts out outlier documents (which belong to cluster -1 or cluster None)
- converts the pandas dataframe to the datasets.Dataset type.
'''
df_cases = pd.read_pickle(df_cases_path)
texts = pd.read_pickle(path2corpus)
unfiltered_labels = df_cases["label_" + label_set]
return text_label_2_labeled_dataset(texts, unfiltered_labels, print_infos)
def get_all_label_set_ids():
df = pd.read_pickle("./database/df_cases.pkl")
return [e[6:] for e in df.columns if "label_" in e]
def get_filename_label_tuple(label_set, get_micro_txt=True, df_cases_file="./database/df_cases.pkl"):
'''
returns (textfilename_list, label_list) as ([filenames], [labels as int list]).
It will still contain outlier labels (they have the value None or -1).
'''
df_cases = pd.read_pickle(df_cases_file)
if "label_" + label_set not in df_cases.columns:
raise ValueError("label set " + label_set + " does not exist in df_cases!")
return None
# convert labels to integers:
int_labels = label_list_as_int_list(df_cases["label_" + label_set])
if get_micro_txt:
return df_cases["description_text_files"], int_labels
else:
return df_cases["diagnosis_text_files"], int_labels
def get_amount_unique_labels(label_set, df_cases_file="./database/df_cases.pkl"):
'''
returns the amount of unique labels (does not count nan or -1 classes).
If label_set does not exist, you will get
an error. If so, run generate_save_hf_dataset(...) to generate a labeled dataset
of type datasets.Dataset (datasets is a library from huggingface)
'''
df_cases = pd.read_pickle(df_cases_file)
if "label_" + label_set not in df_cases.columns:
raise ValueError("label set " + label_set + " does not exist in df_cases!")
return None
# convert labels to integers:
labels = label_list_as_int_list(df_cases["label_" + label_set])
has_none_labels = False
for label in labels:
if label == -1 or np.isnan(label) or label is None:
has_none_labels = True
return len(list(set(labels))) - 1
return len(list(set(labels)))
def get_amount_reports(label_set):
'''
returns amount of reports which have a valid label (excluding -1 and NaN values)
'''
# train_test_dataset = load_labeled_dataset(label_set)
# return len(train_test_dataset["label"])
text, labels = get_filename_label_tuple(label_set)
return len([l for l in labels if l >= 0])
def generate_save_hf_dataset(label_set="LDA", overwrite=True, lower=False):
'''
Generate a labeled dataset of type datasets.Dataset
(datasets is a library from huggingface)
and saves it under "./database/labeled_dataframes/labeld_dataset_" + label_set
'''
dataset_path = "./database/labeled_dataframes/labeld_dataset_" + label_set
if os.path.exists(dataset_path):
print(dataset_path + " already exists.")
if overwrite:
print("generating it new and overwrite " + dataset_path)
else:
print("skipping generation of " + dataset_path)
return
df_cases = pd.read_pickle("./database/df_cases.pkl")
# print(df_cases.columns)
# collect all text-label pairs, skipping invalid labels!
diag_text_tokenized = pd.read_pickle("./database/diag_lst_tokenized.pkl")
texts = []
labels = []
diagnoses = []
skipped_labels = 0
# throw out invalid labels:
print("creating " + dataset_path)
for i, l in enumerate(df_cases["label_" + label_set]):
try:
label = int(l)
if label < 0:
skipped_labels += 1
continue
except:
skipped_labels += 1
continue
labels.append(label)
file_id = df_cases["description_text_files"][i]
with open(path2textfiles + file_id, 'r') as f:
if lower:
texts.append(f.read().lower())
else:
texts.append(f.read())
file_id = df_cases["diagnosis_text_files"][i]
with open(path2diagnosefiles + file_id, 'r') as f:
if lower:
diagnoses.append(f.read().lower())
else:
diagnoses.append(f.read())
print("skipped " + str(skipped_labels) + " labels")
# convert to dataframe:
df = pd.DataFrame({
'text': texts,
'label': labels,
'diagnose': diagnoses
})
# convert pandas dataframe to huggingface dataset:
hf_dataset = datasets.Dataset(pa.Table.from_pandas(df))
'''# how to create a DatasetDict:
test_split_length = 100
hf_data_dict = datasets.DatasetDict({"train": datasets.Dataset(pa.Table.from_pandas(df[test_split_length:])),
"test": datasets.Dataset(pa.Table.from_pandas(df[:test_split_length])),
"unsupervised": hf_dataset})
hf_data_dict.save_to_disk(dataset_path)'''
# print("shape of " + dataset_path + ":")
# print(hf_dataset)
hf_dataset.save_to_disk(dataset_path)
def label_list_as_int_list(labels):
'''
converts a label list to a list of integers,
regardless of whether it is a list of floats or strings
'''
int_labels = []
for i, l in enumerate(labels):
try:
int_labels.append(int(labels[i]))
except:
int_labels.append(-1)
return int_labels
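# Example: label_list_as_int_list([2.0, '3', None, 'foo']) -> [2, 3, -1, -1]
# (anything that cannot be cast to int is mapped to the outlier label -1)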
def get_splits_for_cross_val(dataset, fold_amount=10, stratified=True,
merge_classes=None, oversample=False):
'''
dataset should be something which can be accessed via dataset['text'] and dataset['label'].
Returns splits for k-fold cross-validation as datasets.Dataset type.
With something like merge_classes=[(0,1),(2,3,4)] you can merge the indexed classes into one class.
'''
if merge_classes is not None:
new_labels = [i for i in dataset['label']]
for classes_to_merge in merge_classes:
new_class_name = classes_to_merge[0]
for c in classes_to_merge[1:]:
for i, label in enumerate(dataset['label']):
if int(label) == int(c):
new_labels[i] = new_class_name
dataset = text_label_2_labeled_dataset(dataset['text'], new_labels)
if oversample:
dataset = simple_oversampling(dataset)
if stratified:
skf = StratifiedKFold(n_splits=fold_amount, random_state=None, shuffle=False)
for train_index, test_index in skf.split(dataset['text'], dataset['label']):
yield dataset[train_index], dataset[test_index]
else:
folds = KFold(n_splits=fold_amount, shuffle=False)
for train_index, test_index in folds.split(list(range(len(dataset)))):
yield dataset[train_index], dataset[test_index]
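# Usage sketch (assuming 'ds' is a labeled datasets.Dataset as returned by
# text_label_2_labeled_dataset above; 'train' and 'evaluate' are placeholders):
#   for train_fold, test_fold in get_splits_for_cross_val(ds, fold_amount=10):
#       model = train(train_fold['text'], train_fold['label'])
#       evaluate(model, test_fold['text'], test_fold['label'])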
def simple_oversampling(dataset):
print("oversampling (without augmentation!)...")
unique_labels = np.unique(dataset['label'])
label_amount = [0 for x in range(len(unique_labels))]
texts = []
for i, l in enumerate(tqdm(unique_labels)):
i_th_labels = dataset['label'] == l
label_amount[i] = int(np.sum(i_th_labels))
texts.append([dataset['text'][i] for i,label in enumerate(dataset['label']) if label == l])
max_index = label_amount.index(max(label_amount))
for i, l in enumerate(tqdm(unique_labels)):
if i == max_index:
continue
amount_copies = label_amount[max_index] - label_amount[i]
for x in range(amount_copies):
new_element = {'label': l, 'text': random.choice(texts[i])}
dataset = dataset.add_item(new_element)
return dataset
def main():
# args = argsparse_preamble()
# generate_save_hf_dataset(args.clustered_data)
# print label sets
label_sets = get_all_label_set_ids()
print(label_sets)
# dirty fix of OMP: Error #15: Initializing libiomp5md.dll, but found libiomp5md.dll already initialized.
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
# plot histograms: how many docs have the same label (= cluster index)?
for i, label_set in enumerate(label_sets):
text, labels = get_filename_label_tuple(label_set)
labels = np.asarray(label_list_as_int_list(labels))
# plt.subplot(3, 3, i + 1)
plt.close()
label_num = get_amount_unique_labels(label_set)
x = np.arange(label_num)
h, b = np.histogram(labels, bins=label_num)
plt.bar(x, height=h)
plt.xticks(x, x)
plt.title(label_set)
plt.savefig("TextClustering/plots/" + label_set + "_histogram.png")
if __name__ == '__main__':
main()
import codecs
import nltk
from nltk.probability import FreqDist
def read_german_text(filename):
textfile = codecs.open(filename, 'br', "latin-1")
text= textfile.read()
textfile.close()
return text
#%%
def get_most_frequent_words(text, n_words = 10):
from nltk.probability import FreqDist
tokenizer = nltk.RegexpTokenizer(r"\w+")
word_list = tokenizer.tokenize(text)
freq_dist = FreqDist(word_list)
top_words = freq_dist.most_common(n_words)
top_words = [word[0] for word in top_words]
return top_words
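# Usage sketch (hypothetical text):
#   get_most_frequent_words("Niere Niere Biopsie Glomerulum Biopsie Niere", n_words=2)
#   -> ['Niere', 'Biopsie']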
#%%
def get_most_discriminant_words(text, text_vectorizer, text_counterizer, n_words = 10):
#%% get the vector
if not text:
top_words = []
return top_words
tf_idf_vector = text_vectorizer.transform(text_counterizer.transform(text))
#%% define the coo-function
def sort_coo(coo_matrix):
tuples = zip(coo_matrix.col, coo_matrix.data)
tuples = set(tuples)
tuples = sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
return tuples
#%% define the sort function
def extract_topn_from_vector(feature_names, sorted_items, n_words):
"""get the feature names and tf-idf score of top n items"""
# use only topn items from vector
sorted_items = sorted_items[:n_words]
score_vals = []
feature_vals = []
# word index and corresponding tf-idf score
for idx, score in sorted_items:
# keep track of feature name and its corresponding score
score_vals.append(round(score, 3))
feature_vals.append(feature_names[idx])
# create a tuples of feature,score
# results = zip(feature_vals,score_vals)
results = {}
for idx in range(len(feature_vals)):
results[feature_vals[idx]] = score_vals[idx]
return results
#%% sort the results
sorted_items = sort_coo(tf_idf_vector.tocoo())
# extract only the top n; n here is 10
feature_names = text_counterizer.get_feature_names()
keywords = extract_topn_from_vector(feature_names, sorted_items, n_words)
top_words = list(keywords.keys())
return top_words
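# Usage sketch: text_counterizer and text_vectorizer are assumed to be a fitted
# CountVectorizer and a TfidfTransformer fitted on its output ('corpus' and 'some_text'
# are placeholders):
#   from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
#   cv = CountVectorizer()
#   counts = cv.fit_transform(corpus)            # corpus: list of raw text strings
#   tfidf = TfidfTransformer().fit(counts)
#   top = get_most_discriminant_words([some_text], tfidf, cv, n_words=10)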
#%%
def regexp(pattern, text):
import re
index_start, index_stop = [], []
value = []
for match in re.finditer(pattern, text):
index_start.append(match.start())
index_stop.append(match.end())
value.append(match.group())
return index_start, index_stop
\ No newline at end of file
#%% replace words
import re
replacement_patterns = [
('ittelgradig', 'äßiggradig'),
('ittelschwer', 'äßiggradig'),
('Tubulusepithelschädigung', 'Tubulusepithelschaden'),
('prärenalen', 'prärenale'),
('reversibler', 'reversibel'),
('max.', 'maximal'),
('min.', 'minimal')
]
''' ('ae', 'ä'),
('oe', 'ö'),
('ue', 'ü'),
('Ae', 'Ä'),
('Oe', 'Ö'),
('Ue', 'Ü'),'''
class RegexpReplacer(object):
def __init__(self, patterns=replacement_patterns):
self.patterns = [(re.compile(regex), repl) for (regex, repl) in
patterns]
def replace(self, text):
s = text
for (pattern, repl) in self.patterns:
(s, count) = re.subn(pattern, repl, s)
return s
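# Usage sketch (hypothetical sentence):
#   replacer = RegexpReplacer()
#   replacer.replace("mittelgradige Tubulusepithelschädigung, max. 20%")
#   -> "mäßiggradige Tubulusepithelschaden, maximal 20%"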
#%% combine words
from nltk.tokenize import MWETokenizer
combinedword_patterns = [
('tubulointerstitieller', 'Schaden'),
('prärenalen', 'Genese'),
('prärenale', 'Genese'),
('potentiell', 'reversibler'),
('potentiell', 'reversibel'),
('akuter', 'Tubulusepithelschaden'),
('Lupus', 'Nephritis'),
('diabetische', 'Nephropathie'),
('diabetische', 'Glomerulosklerose'),
('0', '%'),
('1', '%'),
('2', '%'),
('3', '%'),
('4', '%'),
('5', '%'),
('6', '%'),
('7', '%'),
('8', '%'),
('9', '%'),
('tubulointerstitielle', 'Schaedigung'),
('segmentale', 'Glomerulosklerose'),
('globale', 'Glomerulosklerose'),
('potentiell', 'reversible'),
('reversible', 'Tubulusepithelschaden'),
('fokal', 'globale'),
('fokal', 'segmentale'),
('interstitielle', 'Nephritis'),
('proliferierende', 'Glomerulonephritis'),
('segmental', 'nekrotisierende')
]
class RegexpSynonyms(object):
def __init__(self):
self.patterns = combinedword_patterns
def combine(self, text):
mwe_tokenizer = MWETokenizer(self.patterns)
s = mwe_tokenizer.tokenize(text)
return s
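# Usage sketch (input must already be tokenized; word pairs from the pattern list above
# are merged into single tokens joined by '_'):
#   RegexpSynonyms().combine(['akuter', 'Tubulusepithelschaden', 'bei', 'Lupus', 'Nephritis'])
#   -> ['akuter_Tubulusepithelschaden', 'bei', 'Lupus_Nephritis']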
from sys import platform
from os.path import expanduser
from googletrans import Translator # use pip install googletrans==3.1.0a0, 3.0 version is broken
import os
# some words which are usually gets translated wrong (from ger to eng):
custom_translation = {'klasse': 'class', 'nih': 'nih', 'leicht': 'minor', 'miterfasst': 'registered',
'gesamtzahl': 'total amount', 'hinweis': 'hint', 'unauffällig': 'unremarkable',
'weitgehend': 'mainly', 'leichtgradiger': 'mild', 'mäßiggradiger': 'a moderate',
'-nih': '-nih', 'bekannt': 'known', 'anschließend': 'followed_by',
'vorbehaltlich': 'for_now', 'mittels': 'using', 'teils': 'partly'}
def make_directory(path):
if not os.path.isdir(path):
os.makedirs(path)
def translate_to(text, src='de', to='en'):
translator = Translator()
return translator.translate(text, src=src, dest=to).text
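# Usage sketch (requires an internet connection; googletrans calls the Google Translate web API):
#   translate_to("Kein Anhalt für eine Abstoßung", src='de', to='en')
#   -> e.g. "No evidence of rejection" (the exact wording depends on the translation service)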
def main():
pass
if __name__ == '__main__':
main()
\ No newline at end of file