Commit d5224af3 authored by Maximilian Legnar's avatar Maximilian Legnar

added first version

parent a6bfb3de
def cluster_entropy(df):
#%% import
from CorpusHomogeneity.text_entropy import corpus_entropy
import numpy as np
#%%
cluster_ids = np.unique(df.cluster)
ent_mean, ent_std = [None] * len(cluster_ids), [None] * len(cluster_ids)
for idx, i_cluster in enumerate(cluster_ids):
        if i_cluster == -1:  # important: skip texts that were not assigned to any cluster (noise id -1)
ent_mean[idx] = np.nan
ent_std[idx] = np.nan
else:
t_corpus = df[df['cluster'] == i_cluster]
t_corpus = t_corpus.text.tolist()
ent_mean[idx], ent_std[idx] = corpus_entropy(t_corpus)
#%% output
    # aggregate over clusters (compute the std before overwriting the list of per-cluster means)
    ent_std = float(np.nanstd(ent_mean))
    ent_mean = float(np.nanmean(ent_mean))
return ent_mean, ent_std
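# Minimal usage sketch (illustrative only): cluster_entropy expects a DataFrame with a
# 'cluster' column (integer cluster ids, -1 = noise) and a 'text' column, e.g.:
#   import pandas as pd
#   df = pd.DataFrame({'cluster': [0, 0, 1, -1],
#                      'text': ['Befund A', 'Befund B', 'Befund C', 'Rauschen']})
#   mean_entropy, std_entropy = cluster_entropy(df)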
#%% tokenize text
def tokenize_corpus(corpus):
#%% imports
import nltk
import string
from nltk.tokenize import word_tokenize
from HanTa import HanoverTagger as ht
stop_words = nltk.corpus.stopwords.words('german')
tagger = ht.HanoverTagger('morphmodel_ger.pgz')
from tqdm import tqdm
# %% read the files to a list
corpus_tokenized = corpus
for idx, t_text in tqdm(enumerate(corpus_tokenized)):
#%% get the words from the text
t_text = str(t_text)
tokens = word_tokenize(t_text, language='german')
tokens = list(filter(lambda token: token not in string.punctuation, tokens))
#%% get only the nouns
nouns = tagger.tag_sent(tokens)
nouns = [lemma for (word, lemma, pos) in nouns if pos == "NN" or pos == "NE"]
#%% mount it back
corpus_tokenized[idx] = nouns
#%% output-layer
return corpus_tokenized
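# Note: HanoverTagger.tag_sent returns (word, lemma, POS) triples (STTS tagset), e.g. roughly
#   tagger.tag_sent(['Die', 'Niere', 'zeigt', 'Veraenderungen'])
#   -> [('Die', 'die', 'ART'), ('Niere', 'Niere', 'NN'), ...]
# so the comprehension above keeps only the lemmata of nouns (NN) and named entities (NE).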
#%% calculate the entropy
def corpus_entropy(corpus):
    '''
    How much do the individual documents differ from the whole corpus?
    Returns mean and std of the per-document divergence between each document's
    term frequencies and the corpus-wide term frequencies.
    '''
#%% input layer
#corpus_tokenized = tokenize_corpus(corpus)
import numpy as np
text1 = np.asarray(corpus[0])
    is_tokenized = bool(text1.ndim)  # a raw string yields a 0-d array, a token list a 1-d array
if is_tokenized:
corpus_not_tokenized = [" ".join(i_text) for i_text in corpus]
else:
corpus_not_tokenized = corpus
# corpus_not_tokenized = [nltk.tokenize.word_tokenize(i_text, language='german') for i_text in corpus]
#%% count the word-occurences
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
vectorizer = CountVectorizer()
    try:
        X = vectorizer.fit_transform(corpus_not_tokenized)
    except Exception:
        # e.g. empty vocabulary (corpus contains only empty documents)
        return np.nan, np.nan
df = X.toarray()
#%% calculate the entropy
from scipy.stats import entropy
    corpus_tf = df.sum(axis=0)  # corpus-wide term frequencies
    ent_values = []
    for i in range(df.shape[0]):
        document_tf = df[i, :]
        # divergence of the document's term distribution vs. the corpus-wide distribution
        ent_values.append(entropy(document_tf, qk=corpus_tf))
#%% output-layer
entropy_mean = np.nanmean(ent_values)
entropy_std = np.nanstd(ent_values)
#%%
return entropy_mean, entropy_std
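# Note: scipy.stats.entropy(pk, qk) normalizes both count vectors to probability
# distributions and returns the Kullback-Leibler divergence
#   D_KL(p || q) = sum_i p_i * log(p_i / q_i),
# i.e. how strongly a document's term distribution diverges from the corpus-wide one.
# A lower mean value therefore indicates a more homogeneous corpus.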
if __name__ == '__main__':
    # corpus_entropy expects a list of (raw or tokenized) texts; tiny smoke test:
    print(corpus_entropy(["Beispieltext eins", "Beispieltext zwei"]))
# -*- coding: iso-8859-1 -*-
import random
import os, sys
from os import listdir
from os.path import isfile, join
import pyarrow as pa
import pandas as pd
import datasets
from database_preparation.utils_stringpreparation import read_german_text
import argparse
# parse arguments:
sys.path.append(os.getcwd())
parser = argparse.ArgumentParser()
parser.add_argument("--path_to_reports",
default='../DataNephroTexts/reports')
parser.add_argument("--output_path",
default='./LanguageModelling/hf_nephro_set_1')
parser.add_argument("--percentage_train_amount", type=float, default=0.9)
args = parser.parse_args()
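# Example invocation (values shown are the argparse defaults above; adjust the paths to your setup):
#   python generate_dataset_for_mlm.py \
#       --path_to_reports ../DataNephroTexts/reports \
#       --output_path ./LanguageModelling/hf_nephro_set_1 \
#       --percentage_train_amount 0.9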
def save_as_hf_dataset(datapath, file_id_list, output_path):
report_texts = []
    for file_id in file_id_list:  # avoid shadowing the builtin id()
        text = read_german_text(join(datapath, str(file_id)))
report_texts.append(text)
df = pd.DataFrame({
'text': report_texts
})
mytable = pa.Table.from_pandas(df)
my_dataset = datasets.Dataset(mytable)
#my_dataset.save_to_disk(output_path)
my_dataset.to_json(output_path + ".json")
print(f"Generated {output_path}")
def main():
print("processing " + args.path_to_reports)
reports = [f for f in listdir(args.path_to_reports) if isfile(join(args.path_to_reports, f))]
    reps0 = [r for r in reports if r[-5] == '0']  # keep only filenames with '0' at position -5 (e.g. '*0.txt')
random.shuffle(reps0)
last_index = len(reps0) - 1
until = int(args.percentage_train_amount * last_index)
if args.percentage_train_amount < 1:
train = [e for i, e in enumerate(reps0) if i <= until]
val = [e for i, e in enumerate(reps0) if i > until]
save_as_hf_dataset(args.path_to_reports, train, args.output_path + "_train")
save_as_hf_dataset(args.path_to_reports, val, args.output_path + "_validation")
else:
save_as_hf_dataset(args.path_to_reports, reps0, args.output_path)
    # how to load the generated json dataset:
    '''ds = datasets.load_dataset("json", data_files="./LanguageModelling/path2set.json")
    print(ds)'''
    # (datasets.load_from_disk("./LanguageModelling/path2set") applies if save_to_disk is used instead of to_json)
return 0
if __name__ == "__main__":
main()
# -*- coding: iso-8859-1 -*-
'''
This script is based on:
https://github.com/huggingface/transformers/tree/master/examples/pytorch/language-modeling
The passed training text must be of type "datasets.Dataset"
You can use generate_dataset_for_mlm.py to convert a folder with .txt files into a huggingface Dataset.
Use the argument --no_cuda if it is not working with CUDA.
If the GPU memory is too small, reduce --per_device_train_batch_size.
The program arguments are largely the same as those of
transformers.TrainingArguments:
https://huggingface.co/docs/transformers/v4.17.0/en/main_classes/trainer#transformers.TrainingArguments
Good explanations and tips:
https://colab.research.google.com/github/gmihaila/ml_things/blob/master/notebooks/pytorch/pretrain_transformers_pytorch.ipynb#scrollTo=E1F-XIQCdOgj
'''
# example arguments for loss curve analysis:
'''
--model_name_or_path bert-base-german-cased
--train_file ./LanguageModelling/hf_nephro_set_3_train.json
--validation_file ./LanguageModelling/hf_nephro_set_3_validation.json
--output_dir ./LanguageModelling/ger-patho-bert-v3
--do_train
--do_eval
--overwrite_output_dir
--num_train_epochs 10
--evaluation_strategy steps
--logging_steps 30
--whole_word_mask True
--per_device_train_batch_size=8
'''
# example arguments for final training:
'''
--model_name_or_path bert-base-german-cased
--train_file ./LanguageModelling/hf_nephro_set_3.json
--output_dir ./LanguageModelling/ger-patho-bert-v3
--do_train
--overwrite_output_dir
--num_train_epochs 1
--whole_word_mask True
--per_device_train_batch_size=8
'''
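# A training run can then be launched with these arguments, e.g. (assuming this adapted
# run_mlm script is saved as ./LanguageModelling/run_mlm.py):
#   python ./LanguageModelling/run_mlm.py --model_name_or_path bert-base-german-cased \
#       --train_file ./LanguageModelling/hf_nephro_set_3.json --do_train --overwrite_output_dir ...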
# Copyright 2020 The HuggingFace Team All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...) on a text file or a dataset.
Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
https://huggingface.co/models?filter=masked-lm
"""
# You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments.
import logging
import math
import os
import sys
from dataclasses import dataclass, field
from typing import Optional
import datasets
from datasets import load_dataset
import transformers
from transformers import (
CONFIG_MAPPING,
MODEL_FOR_MASKED_LM_MAPPING,
AutoConfig,
AutoModelForMaskedLM,
AutoTokenizer,
DataCollatorForLanguageModeling,
DataCollatorForWholeWordMask,
HfArgumentParser,
Trainer,
TrainingArguments,
set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.13.0.dev0")
# use pip install git+https://github.com/huggingface/transformers to install 4.13 from source!
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
logger = logging.getLogger(__name__)
MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
@dataclass
class ModelArguments:
"""
Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
"""
model_name_or_path: Optional[str] = field(
default=None,
metadata={
"help": "The model checkpoint for weights initialization."
"Don't set if you want to train a model from scratch."
},
)
model_type: Optional[str] = field(
default=None,
metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
)
config_overrides: Optional[str] = field(
default=None,
metadata={
"help": "Override some existing default config settings when a model is trained from scratch. Example: "
"n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index"
},
)
config_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
)
tokenizer_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
)
cache_dir: Optional[str] = field(
default=None,
metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
)
use_fast_tokenizer: bool = field(
default=True,
metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
)
model_revision: str = field(
default="main",
metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
)
use_auth_token: bool = field(
default=False,
metadata={
"help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
"with private models)."
},
)
def __post_init__(self):
if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
raise ValueError(
"--config_overrides can't be used in combination with --config_name or --model_name_or_path"
)
@dataclass
class DataTrainingArguments:
"""
    Arguments pertaining to what data we are going to input our model for training and eval.
"""
dataset_name: Optional[str] = field(
default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
)
dataset_config_name: Optional[str] = field(
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
)
    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
validation_file: Optional[str] = field(
default=None,
metadata={"help": "An optional input evaluation database_preparation file to evaluate the perplexity on (a text file)."},
)
overwrite_cache: bool = field(
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
)
validation_split_percentage: Optional[int] = field(
default=5,
metadata={
"help": "The percentage of the train set used as validation set in case there's no validation split"
},
)
max_seq_length: Optional[int] = field(
default=None,
metadata={
"help": "The maximum total input sequence length after tokenization. Sequences longer "
"than this will be truncated."
},
)
preprocessing_num_workers: Optional[int] = field(
default=None,
metadata={"help": "The number of processes to use for the preprocessing."},
)
mlm_probability: float = field(
default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}
)
line_by_line: bool = field(
default=False,
metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
)
pad_to_max_length: bool = field(
default=False,
metadata={
"help": "Whether to pad all samples to `max_seq_length`. "
"If False, will pad the samples dynamically when batching to the maximum length in the batch."
},
)
max_train_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of training examples to this "
"value if set."
},
)
max_eval_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
"value if set."
},
)
whole_word_mask: bool = field(
default=False,
metadata={
"help": "Wether to use whole-word-masking, defaults False. if False: Use subword-masking"
},
)
def __post_init__(self):
if self.dataset_name is None and self.train_file is None and self.validation_file is None:
raise ValueError("Need either a dataset name or a training/validation file.")
else:
if self.train_file is not None:
extension = self.train_file.split(".")[-1]
if extension not in ["csv", "json", "txt"]:
raise ValueError("`train_file` should be a csv, a json or a txt file.")
if self.validation_file is not None:
extension = self.validation_file.split(".")[-1]
if extension not in ["csv", "json", "txt"]:
raise ValueError("`validation_file` should be a csv, a json or a txt file.")
def main():
# See all possible arguments in src/transformers/training_args.py
# or by passing the --help flag to this script.
# We now keep distinct sets of args, for a cleaner separation of concerns.
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
# If we pass only one argument to the script and it's the path to a json file,
# let's parse it to get our arguments.
model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
else:
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
# Setup logging
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)],
)
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
# Log on each process the small summary:
logger.warning(
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
)
# Set the verbosity to info of the Transformers logger (on main process only):
logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# Set seed before initializing model.
set_seed(training_args.seed)
# Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
# or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
# (the dataset will be downloaded automatically from the datasets Hub
#
# For CSV/JSON files, this script will use the column called 'text' or the first column. You can easily tweak this
# behavior (see below)
#
# In distributed training, the load_dataset function guarantee that only one local process can concurrently
# download the dataset.
if data_args.dataset_name is not None:
# Downloading and loading a dataset from the hub.
raw_datasets = load_dataset(
data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
)
if "validation" not in raw_datasets.keys():
raw_datasets["validation"] = load_dataset(
data_args.dataset_name,
data_args.dataset_config_name,
split=f"train[:{data_args.validation_split_percentage}%]",
cache_dir=model_args.cache_dir,
)
raw_datasets["train"] = load_dataset(
data_args.dataset_name,
data_args.dataset_config_name,
split=f"train[{data_args.validation_split_percentage}%:]",
cache_dir=model_args.cache_dir,
)
else:
data_files = {}
if data_args.train_file is not None:
data_files["train"] = data_args.train_file
extension = data_args.train_file.split(".")[-1]
if data_args.validation_file is not None:
data_files["validation"] = data_args.validation_file
extension = data_args.validation_file.split(".")[-1]
if extension == "txt":
extension = "text"
raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
        # If no validation data is there, validation_split_percentage will be used to divide the dataset.
if "validation" not in raw_datasets.keys():
raw_datasets["validation"] = load_dataset(
extension,
data_files=data_files,
split=f"train[:{data_args.validation_split_percentage}%]",
cache_dir=model_args.cache_dir,
)
raw_datasets["train"] = load_dataset(
extension,
data_files=data_files,
split=f"train[{data_args.validation_split_percentage}%:]",
cache_dir=model_args.cache_dir,
)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html.
# Load pretrained model and tokenizer
#
# Distributed training:
# The .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.
config_kwargs = {
"cache_dir": model_args.cache_dir,
"revision": model_args.model_revision,
"use_auth_token": True if model_args.use_auth_token else None,
}
if model_args.config_name:
config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
elif model_args.model_name_or_path:
config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
else:
config = CONFIG_MAPPING[model_args.model_type]()
logger.warning("You are instantiating a new config instance from scratch.")
if model_args.config_overrides is not None:
logger.info(f"Overriding config: {model_args.config_overrides}")
config.update_from_string(model_args.config_overrides)
tokenizer_kwargs = {
"cache_dir": model_args.cache_dir,
"use_fast": model_args.use_fast_tokenizer,
"revision": model_args.model_revision,
"use_auth_token": True if model_args.use_auth_token else None,
}
if model_args.tokenizer_name:
tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
elif model_args.model_name_or_path:
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)
else:
raise ValueError(
"You are instantiating a new tokenizer from scratch. This is not supported by this script."
"You can do it from another script, save it, and load it from here, using --tokenizer_name."
)
if model_args.model_name_or_path:
model = AutoModelForMaskedLM.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
else:
logger.info("Training new model from scratch")
model = AutoModelForMaskedLM.from_config(config)
model.resize_token_embeddings(len(tokenizer))
# Preprocessing the datasets.
# First we tokenize all the texts.
if training_args.do_train:
column_names = raw_datasets["train"].column_names
else:
column_names = raw_datasets["validation"].column_names
text_column_name = "text" if "text" in column_names else column_names[0]
if data_args.max_seq_length is None:
max_seq_length = tokenizer.model_max_length
if max_seq_length > 1024:
logger.warning(
f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
"Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
)
max_seq_length = 1024
else:
if data_args.max_seq_length > tokenizer.model_max_length:
logger.warning(
f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
)
max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
if data_args.line_by_line:
# When using line_by_line, we just tokenize each nonempty line.
padding = "max_length" if data_args.pad_to_max_length else False
def tokenize_function(examples):
# Remove empty lines
examples[text_column_name] = [
line for line in examples[text_column_name] if len(line) > 0 and not line.isspace()
]
return tokenizer(
examples[text_column_name],
padding=padding,
truncation=True,
max_length=max_seq_length,
# We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
# receives the `special_tokens_mask`.
return_special_tokens_mask=True,
)
with training_args.main_process_first(desc="dataset map tokenization"):
tokenized_datasets = raw_datasets.map(
tokenize_function,
batched=True,
num_proc=data_args.preprocessing_num_workers,
remove_columns=[text_column_name],
load_from_cache_file=not data_args.overwrite_cache,
desc="Running tokenizer on dataset line_by_line",
)
else:
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
# We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
# efficient when it receives the `special_tokens_mask`.
def tokenize_function(examples):
return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
with training_args.main_process_first(desc="dataset map tokenization"):
tokenized_datasets = raw_datasets.map(
tokenize_function,
batched=True,
num_proc=data_args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not data_args.overwrite_cache,
desc="Running tokenizer on every text in dataset",
)
    # Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.
if total_length >= max_seq_length:
total_length = (total_length // max_seq_length) * max_seq_length
# Split by chunks of max_len.
result = {
k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
for k, t in concatenated_examples.items()
}
return result
# Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
# remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
# might be slower to preprocess.
#
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
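    # Worked example: with max_seq_length = 512 and a batch of 1,000 tokenized texts whose
    # concatenated length is 10,300 tokens, total_length is truncated to
    # (10300 // 512) * 512 = 10240 and re-split into 20 chunks of 512 tokens;
    # the trailing 60 tokens are dropped.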
with training_args.main_process_first(desc="grouping texts together"):
tokenized_datasets = tokenized_datasets.map(
group_texts,
batched=True,
num_proc=data_args.preprocessing_num_workers,
load_from_cache_file=not data_args.overwrite_cache,
desc=f"Grouping texts in chunks of {max_seq_length}",
)
if training_args.do_train:
if "train" not in tokenized_datasets:
raise ValueError("--do_train requires a train dataset")
train_dataset = tokenized_datasets["train"]
if data_args.max_train_samples is not None:
train_dataset = train_dataset.select(range(data_args.max_train_samples))
if training_args.do_eval:
if "validation" not in tokenized_datasets:
raise ValueError("--do_eval requires a validation dataset")
eval_dataset = tokenized_datasets["validation"]
if data_args.max_eval_samples is not None:
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
# Data collator
# This one will take care of randomly masking the tokens.
pad_to_multiple_of_8 = data_args.line_by_line and training_args.fp16 and not data_args.pad_to_max_length
if data_args.whole_word_mask:
data_collator = DataCollatorForWholeWordMask(
tokenizer=tokenizer,
mlm_probability=data_args.mlm_probability,
pad_to_multiple_of=8 if pad_to_multiple_of_8 else None,
)
logger.info("*** train with whole word ***")
else:
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer,
mlm_probability=data_args.mlm_probability,
pad_to_multiple_of=8 if pad_to_multiple_of_8 else None,
)
logger.info("*** train with sub-word masking ***")
# Initialize our Trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset if training_args.do_train else None,
eval_dataset=eval_dataset if training_args.do_eval else None,
tokenizer=tokenizer,
data_collator=data_collator,
)
# Training
if training_args.do_train:
checkpoint = None
if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() # Saves the tokenizer too for easy upload
metrics = train_result.metrics
max_train_samples = (
data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
)
metrics["train_samples"] = min(max_train_samples, len(train_dataset))
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()
# Evaluation
if training_args.do_eval:
logger.info("*** Evaluate ***")
metrics = trainer.evaluate()
max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
try:
perplexity = math.exp(metrics["eval_loss"])
except OverflowError:
perplexity = float("inf")
metrics["perplexity"] = perplexity
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)
kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "fill-mask"}
if data_args.dataset_name is not None:
kwargs["dataset_tags"] = data_args.dataset_name
if data_args.dataset_config_name is not None:
kwargs["dataset_args"] = data_args.dataset_config_name
kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
else:
kwargs["dataset"] = data_args.dataset_name
if training_args.push_to_hub:
trainer.push_to_hub(**kwargs)
else:
trainer.create_model_card(**kwargs)
def _mp_fn(index):
# For xla_spawn (TPUs)
main()
if __name__ == "__main__":
main()
# NLP in diagnostic texts from nephropathology
This python project was created as part of the article "Natural Language Processing in diagnostic texts from
nephropathology".
The paper can be found [here](LINK).
The scripts ```database_preparation/data_preparation_pipeline.py```, ```TextClustering/clustering_pipeline.py```
and ```TextClassification/classification_pipeline.py``` give an idea of how this project can be used with other datasets.
The scripts ```TextClustering/basedOn_BOW/kmeans_Diagnosis.py```,
```TextClustering/basedOn_BOW/HDBSCAN_Diagnosis.py``` and ```TextClassification/bow_classification.py```
can also process tf-idf vectorized corpora.
All other scripts can only process corpora that are not vectorized.
Feel free to use and adapt the scripts to your own needs.
## Requirements
For preprocessing, the project requires some nltk corpora:
```
import nltk
nltk.download('stopwords')
nltk.download('punkt')
```
import numpy as np
import tensorflow_datasets as tfds
import tensorflow as tf
tfds.disable_progress_bar()
from matplotlib import pyplot as plt
import pandas as pd
import os
import sys
from TextClassification.argsparse_classification_preamble import argsparse_preamble
import classification_metrics as cls_metrics
import time
from database_preparation.utils_labeled_datasets import get_splits_for_cross_val
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
#from keras.layers.embeddings import Embedding
from tensorflow.keras.layers import Embedding
from keras_preprocessing.sequence import pad_sequences
from database_preparation.utils_labeled_datasets import text_label_files_to_labeled_dataset
from database_preparation.preprocess import print_meta_data
sys.path.append(os.getcwd())
def plot_graphs(history, metric):
plt.close()
plt.plot(history.history[metric])
plt.plot(history.history['val_'+metric])
plt.xlabel("Epochs")
plt.ylabel(metric)
plt.legend([metric, 'val_'+metric])
plt.show()
def save_graphs(history, metric, save_path):
plt.close()
plt.plot(history.history[metric])
plt.plot(history.history['val_' + metric])
plt.xlabel("Epochs")
plt.ylabel(metric)
plt.legend([metric, 'val_' + metric])
#plt.show()
plt.savefig(save_path, dpi=300)
print("generated "+save_path)
def dict2tf_dataset(data_dict):
    return tf.data.Dataset.from_tensor_slices((list(data_dict['text']), data_dict['label']))
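# dict2tf_dataset expects a mapping like {'text': [...], 'label': [...]} (as produced by the
# cross-validation splits below) and yields (text, label) pairs, e.g. (illustrative only):
#   ds = dict2tf_dataset({'text': ['Befund A', 'Befund B'], 'label': [0, 1]})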
def train_test_updatemetrics(train_dataset, test_dataset, num_classes, metrics,
num_epochs=50, plot_loss=False,
plot_save_path="TextClassification/plots/CNN/CNN_loss.png"):
train_dataset = dict2tf_dataset(train_dataset)
test_dataset = dict2tf_dataset(test_dataset)
############### Create the text encoder ###################
VOCAB_SIZE = 5000
encoder = tf.keras.layers.TextVectorization(
max_tokens=VOCAB_SIZE)
encoder.adapt(train_dataset.map(lambda text, label: text))
vocab = np.array(encoder.get_vocabulary())
# encode data to word-indices:
X_train = []
y_train = []
X_test = []
y_test = []
for text, label in train_dataset:
X_train.append(encoder(text).numpy())
y_train.append(label)
for text, label in test_dataset:
X_test.append(encoder(text).numpy())
y_test.append(label)
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)
X_test = np.asarray(X_test)
y_test = np.asarray(y_test)
##### create the model: #####
    # Padding the data samples to a maximum length (in tokens)
max_words = 450
X_train = pad_sequences(X_train, maxlen=max_words)
X_test = pad_sequences(X_test, maxlen=max_words)
# Building the CNN Model
embedding_dim = 100
filter_amount = 32
filter_size = 3
    model = Sequential()  # initializing the sequential CNN model
model.add(Embedding(len(encoder.get_vocabulary()), embedding_dim, input_length=max_words, mask_zero=True))
model.add(Conv1D(filter_amount, filter_size, padding='same', activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy',
optimizer=tf.keras.optimizers.Adam(1e-4),
metrics=["accuracy"])
# model.summary()
start = time.time()
# evaluate:
if plot_loss:
history = model.fit(X_train, y_train, epochs=num_epochs,
batch_size=128, verbose=2,
validation_data=(X_test, y_test),
validation_steps=1,
)
save_graphs(history, 'loss', plot_save_path)
else:
model.fit(X_train, y_train, epochs=num_epochs,
batch_size=128, verbose=2)
predictions = model.predict(X_test)
y_pred = np.argmax(predictions, axis=-1)
metrics.update_metrics(y_test, y_pred, True, start)
def main():
############# get labeled text data ###################
args = argsparse_preamble()
print("CNN Evaluation with corpus " + args.path2corpus + " and cluster set " + args.clustered_data)
print("infos about corpus:")
print_meta_data(args.path2corpus)
dataset = text_label_files_to_labeled_dataset(args.clustered_data, path2corpus=args.path2corpus)
num_classes = int(pd.DataFrame(dataset["label"]).nunique())
metrics = cls_metrics.ClassificationMetrics("CNN")
epochs = 100
folds = 10
for i, (train_dataset, test_dataset) in enumerate(get_splits_for_cross_val(dataset, folds)):
if args.loss_curve_check:
train_test_updatemetrics(train_dataset, test_dataset, num_classes, metrics,
epochs, plot_loss=True,
plot_save_path="TextClassification/plots/CNN/CNN_loss_"+str(i+1)+".png")
else:
print("====== CNN train/test run " + str(i+1) + "/" + str(folds) + " ======")
print(str(len(train_dataset["label"]))+" train documents")
print(str(len(test_dataset["label"])) + " test documents")
train_test_updatemetrics(train_dataset, test_dataset, num_classes, metrics, epochs)
if not args.loss_curve_check:
metrics.save_scores_to_disk(args.clustered_data)
metrics.pickle_object(args.clustered_data)
cls_metrics.print_results_as_latextable(metrics.json_file_path)
if __name__ == '__main__':
main()
import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt
import pandas as pd
import os
import sys
sys.path.append(os.getcwd())
from TextClassification.argsparse_classification_preamble import argsparse_preamble
import TextClassification.classification_metrics as cls_metrics
import time
from database_preparation.utils_labeled_datasets import get_splits_for_cross_val
from database_preparation.utils_labeled_datasets import text_label_files_to_labeled_dataset
from database_preparation.preprocess import print_meta_data
def save_graphs(history, metric, save_path):
plt.close()
plt.plot(history.history[metric])
plt.plot(history.history['val_' + metric])
plt.xlabel("Epochs")
plt.ylabel(metric)
plt.legend([metric, 'val_' + metric])
#plt.show()
plt.savefig(save_path, dpi=300)
print("generated "+save_path)
def dict2tf_dataset(data_dict):
    return tf.data.Dataset.from_tensor_slices(([text.lower() for text in data_dict['text']], data_dict['label']))
def train_test_updatemetrics(train_dataset, test_dataset, num_classes, metrics,
epochs=10, plot_loss=False,
plot_save_path="TextClassification/plots/RNN/RNN_loss.png"):
y_test = np.asarray(test_dataset['label'])
train_dataset = dict2tf_dataset(train_dataset)
test_dataset = dict2tf_dataset(test_dataset)
# Next shuffle the data for training and create batches of these (text, label) pairs:
BUFFER_SIZE = 10000
BATCH_SIZE = 64
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
for example, label in train_dataset.take(1):
'''print('text: ', example.numpy())
print('label: ', label.numpy())'''
pass
############### Create the text encoder ###################
VOCAB_SIZE = 5000
encoder = tf.keras.layers.TextVectorization(
max_tokens=VOCAB_SIZE)
encoder.adapt(train_dataset.map(lambda text, label: text))
vocab = np.array(encoder.get_vocabulary())
'''print("vocab info:")
print(vocab[:20])
print(len(encoder.get_vocabulary()))
encoded_example = encoder(example)[:3].numpy()
print(encoded_example)
for n in range(3):
print("Original: ", example[n].numpy())
print("Round-trip: ", " ".join(vocab[encoded_example[n]]))
print()'''
##### create the model: #####
embedding_dim = 64
model = tf.keras.Sequential([
encoder,
tf.keras.layers.Embedding(
input_dim=len(encoder.get_vocabulary()),
output_dim=embedding_dim,
# Use masking to handle the variable sequence lengths
mask_zero=True),
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
tf.keras.layers.Dense(embedding_dim, activation='relu'),
tf.keras.layers.Dense(num_classes, activation='softmax')
])
# model.summary()
# stacking 2 LSTM layers (seems to be much worse):
'''model = tf.keras.Sequential([
encoder,
tf.keras.layers.Embedding(len(encoder.get_vocabulary()), 64, mask_zero=True),
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
tf.keras.layers.Dense(64, activation='relu'),
tf.keras.layers.Dropout(0.5),
tf.keras.layers.Dense(num_classes, activation='softmax')
])'''
# All the layers after the Embedding support masking:
# print([layer.supports_masking for layer in model.layers])
# predict on a sample text without padding.
'''print("test prediction:")
sample_text = ('The movie was cool. The animation and the graphics '
'were out of this world. I would recommend this movie.')
predictions = model.predict(np.array([sample_text]))
print(predictions)'''
model.compile(loss='sparse_categorical_crossentropy',
optimizer=tf.keras.optimizers.Adam(1e-4),
metrics=["accuracy"])
start = time.time()
# evaluate:
if plot_loss:
history = model.fit(train_dataset, epochs=epochs,
validation_data=test_dataset,
validation_steps=2)
test_loss, test_acc = model.evaluate(test_dataset)
print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)
#save_graphs(history, 'accuracy')
save_graphs(history, 'loss', plot_save_path)
else:
model.fit(train_dataset, epochs=epochs)
predictions = model.predict(test_dataset)
y_pred = np.argmax(predictions, axis=-1)
metrics.update_metrics(y_test, y_pred, True, start)
def main():
############# get labeled text data ###################
# how to convert words 2 ids with gensim:
# words = corpora.Dictionary(diag_lst)
args = argsparse_preamble()
print("RNN Evaluation with corpus " + args.path2corpus + " and cluster set " + args.clustered_data)
print("infos about corpus:")
print_meta_data(args.path2corpus.replace('.pkl', '_meta.pkl'))
# dataset = load_labeled_dataset(args.clustered_data)
dataset = text_label_files_to_labeled_dataset(args.clustered_data, path2corpus=args.path2corpus)
num_classes = int(pd.DataFrame(dataset["label"]).nunique())
metrics = cls_metrics.ClassificationMetrics("RNN")
folds = 10
epochs = 70
for i, (train_dataset, test_dataset) in enumerate(get_splits_for_cross_val(dataset, folds)):
if args.loss_curve_check:
train_test_updatemetrics(train_dataset, test_dataset, num_classes,
metrics, epochs=epochs, plot_loss=True,
plot_save_path="TextClassification/plots/RNN/RNN_loss_"+str(i+1)+".png")
else:
print("====== RNN train/test run " + str(i + 1) + "/" + str(folds) + " ======")
print(str(len(train_dataset["label"])) + " train documents")
print(str(len(test_dataset["label"])) + " test documents")
train_test_updatemetrics(train_dataset, test_dataset, num_classes, metrics, epochs=epochs)
if not args.loss_curve_check:
metrics.save_scores_to_disk(args.clustered_data)
metrics.pickle_object(args.clustered_data)
cls_metrics.print_results_as_latextable(metrics.json_file_path)
if __name__ == '__main__':
main()
import argparse
import sys, os
def argsparse_preamble():
sys.path.append(os.getcwd())
parser = argparse.ArgumentParser()
parser.add_argument("--overwrite", action='store_true')#False: generate data only if it doesn already exist
parser.add_argument("--show_figures", action='store_true')
parser.add_argument("--clustered_data", default="HDBSCAN")
parser.add_argument("--path2corpus", default="database/bow_prepro_desc.pkl")
parser.add_argument("--loss_curve_check", action='store_true')
args = parser.parse_args()
return args
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import datasets
import torch
import pandas as pd
import numpy as np
from transformers import Trainer
from transformers import TrainingArguments
import os
import sys
import pyarrow as pa
sys.path.append(os.getcwd())
from TextClassification.argsparse_classification_preamble import argsparse_preamble
import TextClassification.classification_metrics as cls_metrics
from database_preparation.utils_labeled_datasets import get_splits_for_cross_val
from database_preparation.utils_labeled_datasets import text_label_files_to_labeled_dataset
from database_preparation.preprocess import print_meta_data
from database_preparation.utils_labeled_datasets import is_text_lst_tokenized, is_text_lst_tfidf_vectorized
args = argsparse_preamble()
models_save_path = "./TextClassification/models/bert_models_new"
if not os.path.isdir(models_save_path):
os.makedirs(models_save_path)
########## functions ##########
def train(train_set, test_set, classifier_save_path, base_bert_model,
overwrite=False, track_loss_curves=True, epochs=3,
learning_rate=5e-5, save_model=True, cuda_batch_size=8):
'''
trains and saves the model + train/test-data at classifier_save_path
'''
if save_model:
if os.path.isdir(classifier_save_path):
if overwrite:
print(classifier_save_path + " already exists! (overwriting old model!)")
else:
print(classifier_save_path + " already exists! (skipping training)")
return
# This will issue a warning about some of the pretrained weights not being used and some weights being randomly initialized.
# That’s because we are throwing away the pretraining head of the BERT model to replace it with a classification head which is randomly initialized.
# We will fine-tune this model on our task, transferring the knowledge of the pretrained model to it (which is why doing this is called transfer learning).
    if test_set is None:
num_labels = len(np.unique(train_set["label"]))
else:
num_labels = len(np.unique(train_set["label"] + test_set["label"]))
model = AutoModelForSequenceClassification.from_pretrained(base_bert_model, num_labels=num_labels)
if torch.cuda.is_available():
batch_size = cuda_batch_size
else:
batch_size = 8
if track_loss_curves:
training_args = TrainingArguments(classifier_save_path + "/trainer",
overwrite_output_dir=True,
save_steps=2000,
do_train=True,
do_eval=True,
num_train_epochs=epochs,
evaluation_strategy='steps',
logging_steps=2000,
per_device_train_batch_size=batch_size,
learning_rate=learning_rate
)
else:
training_args = TrainingArguments(classifier_save_path + "/trainer",
overwrite_output_dir=True,
save_steps=2000,
num_train_epochs=epochs,
logging_steps=2000,
per_device_train_batch_size=batch_size,
learning_rate=learning_rate
)
print("training args: " + str(training_args.to_dict()))
print("device:" + str(training_args.device))
print("gpus: " + str(training_args.n_gpu))
trainer = Trainer(
model=model, args=training_args, train_dataset=train_set, eval_dataset=test_set
)
# training
train_result = trainer.train()
if track_loss_curves:
# compute train results
metrics = train_result.metrics
metrics["train_samples"] = len(train_set)
# save train results
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()
# compute evaluation results
metrics = trainer.evaluate()
metrics["eval_samples"] = len(test_set)
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)
# save model
    if save_model and test_set is not None:
model.save_pretrained(classifier_save_path)
hf_data_dict = datasets.DatasetDict({"train": train_set, "test": test_set})
hf_data_dict.save_to_disk(classifier_save_path + "/tokenized_train_test_dataset")
return model
def evaluate_saved_model(classifier_path, metrics_obj):
# load model:
model = AutoModelForSequenceClassification.from_pretrained(classifier_path, from_tf=False)
# load tokenized datasets:
train_test_set = datasets.DatasetDict.load_from_disk(classifier_path + "/tokenized_train_test_dataset")
# train_set = train_test_set["train"]
test_set = train_test_set["test"]
evaluate(model, test_set, metrics_obj)
def evaluate(model, test_set, metrics_obj):
# just use default parameters
training_args = TrainingArguments("TextClassification/models/temp_trainer", evaluation_strategy="epoch",
overwrite_output_dir=True, )
trainer = Trainer(
model=model,
args=training_args,
# train_dataset=train_set,
eval_dataset=test_set
)
# print(trainer.evaluate())
predictions = trainer.predict(test_set)
preds = np.argmax(predictions.predictions, axis=-1)
metrics_obj.update_metrics(predictions.label_ids, preds, True)
def main():
if torch.cuda.is_available():
torch.cuda.empty_cache()
print("running with cuda")
label_sets = [args.clustered_data]
if is_text_lst_tokenized(args.path2corpus):
print(f"Error: {args.path2corpus} is a tokenized corpus. Please pass a not tokenized corpus!")
exit(1)
base_bert_models = ["./LanguageModelling/ger-patho-bert-2", "bert-base-german-cased"]
# base_bert_models = ["bert-base-german-cased"]
evaluate_test_set = True
do_train = True
    test_run = False  # runs k-fold cross-validation with only one train/test run per model
folds = 10
track_loss_curves = False
epochs = 4
save_model = False
cuda_batch_size = 2
for label_set in label_sets:
# train_test_dataset = dt.load_labeled_dataset(label_set)
train_test_dataset = text_label_files_to_labeled_dataset(args.clustered_data, path2corpus=args.path2corpus)
        if train_test_dataset is None:
            print("can't do BERT training without data!")
            sys.exit()
        # pre-save the train/test splits for cross-validation,
        # so that each model is trained and tested on the same data.
k_train_test_sets = []
for (train_dataset, test_dataset) in get_splits_for_cross_val(train_test_dataset, folds):
k_train_test_sets.append(tuple((train_dataset, test_dataset)))
for base_bert_model in base_bert_models:
print(base_bert_model + " Evaluation with corpus " + args.path2corpus + " and cluster set " + label_set)
print("infos about corpus:")
print_meta_data(args.path2corpus)
# compose names, depending on label_set and base bert model:
if "./LanguageModelling/" in base_bert_model:
# is it a custom LM from our languagemodeling-folder?
name = base_bert_model.replace("./LanguageModelling/", "")
classifier_path = models_save_path + "/" + name + "_" + label_set + "_ClassificatonModel"
metrics = cls_metrics.ClassificationMetrics(name)
elif '/' in base_bert_model and not './' in base_bert_model:
name = base_bert_model.replace("/", "_")
classifier_path = models_save_path + "/" + name + "_" + label_set + "_ClassificatonModel"
metrics = cls_metrics.ClassificationMetrics(name)
elif "gottbert-base" in base_bert_model:
classifier_path = models_save_path + "/gottbert-base_" + label_set + "_ClassificatonModel"
metrics = cls_metrics.ClassificationMetrics("gottbert-base")
else: # germanbert
classifier_path = models_save_path + "/" + base_bert_model + "_" + label_set + "_ClassificatonModel"
metrics = cls_metrics.ClassificationMetrics(base_bert_model)
if save_model:
print("saving model at: ")
print(classifier_path)
# cross validation:
for i, (train_dataset, test_dataset) in enumerate(k_train_test_sets):
# convert to dataframe:
train_dataset_ds = datasets.Dataset(pa.Table.from_pandas(pd.DataFrame(train_dataset)))
test_dataset_ds = datasets.Dataset(pa.Table.from_pandas(pd.DataFrame(test_dataset)))
# tokenize
tokenizer = AutoTokenizer.from_pretrained(base_bert_model)
def tokenize_function(examples):
return tokenizer(examples["text"], padding="max_length", truncation=True)
train_set = train_dataset_ds.map(tokenize_function, batched=True)
test_set = test_dataset_ds.map(tokenize_function, batched=True)
# train
if do_train:
print("==> training " + classifier_path + "_" + str(i))
model = train(train_set, test_set, classifier_path + "_" + str(i), base_bert_model,
track_loss_curves=track_loss_curves, epochs=epochs,
save_model=save_model, cuda_batch_size=cuda_batch_size)
# evaluate
if evaluate_test_set:
if save_model:
print("==> predicting test set with " + classifier_path + "_" + str(i))
evaluate_saved_model(classifier_path + "_" + str(i), metrics)
else:
print("==> predicting test set with " + classifier_path + "_" + str(i))
evaluate(model, test_set, metrics)
if test_run:
break
metrics.save_scores_to_disk(label_set)
metrics.pickle_object(label_set)
if __name__ == '__main__':
main()
# -*- coding: iso-8859-1 -*-
import os
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier # stochastic gradient descent (SGD) learning
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from TextClassification.argsparse_classification_preamble import argsparse_preamble
import database_preparation.utils_labeled_datasets as dt
from database_preparation.utils_labeled_datasets import get_splits_for_cross_val
import TextClassification.classification_metrics as cls_metrics
from database_preparation.preprocess import print_meta_data
from database_preparation.utils_labeled_datasets import is_text_lst_tfidf_vectorized
import pickle
import numpy as np
import pandas as pd
'''from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline as make_pipeline_imb'''
# experiment:
merge_classes = [(0, 1), (5, 7), (9, 10, 11), (6, 15)]
# for tfidf vectorizer
def identity(words):
return words
def create_pipeline(estimator, reduction=False, with_vectorizer=True):
    '''
    Construct a pipeline with sklearn.pipeline.
    The passed estimator will be the last element of the pipeline,
    using tf-idf as the vectorizer (unless with_vectorizer=False).
    '''
steps = []
if with_vectorizer:
steps.append(
('vectorize', TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False))
)
if reduction:
steps.append((
'reduction', TruncatedSVD(n_components=1000)
))
# Add the estimator
steps.append(('classifier', estimator))
return Pipeline(steps)
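# Minimal usage sketch (illustrative, mirroring the cross-validation loop further below):
#   pipe = create_pipeline(SGDClassifier())
#   pipe.fit(train_tokens, train_labels)   # texts as token lists; identity() skips re-tokenization
#   y_pred = pipe.predict(test_tokens)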
def get_immediate_subdirectories(a_dir):
return [name for name in os.listdir(a_dir)
if os.path.isdir(os.path.join(a_dir, name))]
def cross_validate_with_bow_classifiers(label_set, fold_amount=10,
path2corpus="./database/bow_prepro_desc.pkl",
df_cases_file="database/df_cases.pkl"):
'''
cross validates passed label_set with text data saved in path2corpus and labels saved in df_cases_file.
path2corpus should be a list of reports, where each report is tokenized
or a list of tf-idf vectorized texts (of type scipy.sparse.csr.csr_matrix).
'''
if is_text_lst_tfidf_vectorized(path2corpus):
models = []
models.append(create_pipeline(MultinomialNB(), with_vectorizer=False))
models.append(create_pipeline(MLPClassifier(max_iter=300), with_vectorizer=False))
models.append(create_pipeline(LogisticRegression(), with_vectorizer=False))
models.append(create_pipeline(SGDClassifier(), with_vectorizer=False))
with open(path2corpus, 'rb') as f:
loaded_texts = pickle.load(f)
n = np.asarray(loaded_texts.todense().tolist())
from database_preparation.utils_labeled_datasets import text_label_2_labeled_dataset
df_cases = pd.read_pickle(df_cases_file)
dataset = text_label_2_labeled_dataset(n, df_cases["label_" + label_set])
else:
# create model-pipelines for cross-validation with different pipelines:
models = []
models.append(create_pipeline(SGDClassifier()))
models.append(create_pipeline(MultinomialNB()))
models.append(create_pipeline(LogisticRegression()))
models.append(create_pipeline(MLPClassifier(max_iter=300)))
'''models.append(make_pipeline_imb(TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)
, RandomOverSampler(), SGDClassifier()))'''
# print(f"train models {[model['classifier'] for model in models]} with corpus {path2corpus} and cluster set {label_set}")
print("infos about corpus:")
print_meta_data(path2corpus)
dataset = dt.text_label_files_to_labeled_dataset(label_set, path2corpus=path2corpus,
df_cases_path=df_cases_file)
# in order to use same 10-fold-cross-splits for each model:
k_train_test_sets = []
for (train_dataset, test_dataset) in get_splits_for_cross_val(dataset,
fold_amount, merge_classes=None, oversample=False, stratified=True):
k_train_test_sets.append(tuple((train_dataset, test_dataset)))
# cross validate each model and save metrics:
for model in models:
print('running ' + str(model['classifier']))
name = model.named_steps['classifier'].__class__.__name__
if 'reduction' in model.named_steps:
name += " (TruncatedSVD)"
metrics = cls_metrics.ClassificationMetrics(name)
for i, (train_dataset, test_dataset) in enumerate(k_train_test_sets):
model.fit(train_dataset['text'], train_dataset['label'])
y_pred = model.predict(test_dataset['text'])
metrics.update_metrics(test_dataset['label'], y_pred, False)
metrics.save_scores_to_disk(label_set)
metrics.pickle_object(label_set)
df = metrics.classes_scores(-1)
print(df.to_latex().replace('{}', 'cluster'))
cls_metrics.print_results_as_latextable(metrics.json_file_path)
def main():
args = argsparse_preamble()
cross_validate_with_bow_classifiers(args.clustered_data, path2corpus=args.path2corpus)
if __name__ == '__main__':
main()
# -*- coding: iso-8859-1 -*-
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
import sys
import database_preparation.utils_labeled_datasets as dt
# for training validation:
import TextClassification.classification_metrics as cls_metrics
from sklearn.decomposition import TruncatedSVD
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
import nltk
import datasets
import pyarrow as pa
import pickle
fold_amount = 10
#%%
# for tfidf vectorizer
def identity(words):
return words
def create_pipeline(estimator, reduction=False):
    '''
    Construct a pipeline with sklearn.pipeline.
    The passed estimator will be the last element of the pipeline,
    using tf-idf as the vectorizer.
    '''
steps = []
steps.append(
('vectorize', TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False))
)
if reduction:
steps.append((
'reduction', TruncatedSVD(n_components=1000)
))
# Add the estimator
steps.append(('classifier', estimator))
return Pipeline(steps)
def cross_validate_with_simple_SVM(label_set, path2corpus = "./database/bow_prepro_diag.pkl", path2dfcases='./database/df_cases.pkl'):
"""
    trains a simple SVM (SGDClassifier pipeline) with the given data and
    returns a ClassificationMetrics object with 10-fold cross-validation scores
"""
print(f"Calculating SVM-classification performance of {label_set} cluster-setr "
f"with text corpus {path2corpus}.")
metrics = cls_metrics.ClassificationMetrics(label_set)
#print("train SVM for cluster " + label_set + " with " + path2corpus + ".")
text_lst = pd.read_pickle(path2corpus)
text1 = np.asarray(text_lst[0])
corpus_is_tokenized = bool(text1.ndim)
del text1, text_lst
if corpus_is_tokenized:
dataset = dt.text_label_files_to_labeled_dataset(label_set,
path2corpus
, path2dfcases, False)
else:
dataset_raw = dt.text_label_files_to_labeled_dataset(label_set,
path2corpus
, path2dfcases, False)
# tokenize
tokenized_texts = []
for t_text in dataset_raw['text']:
tokenized_texts.append(nltk.tokenize.word_tokenize(t_text, language='german'))
df = pd.DataFrame({'text': tokenized_texts, 'label': dataset_raw['label']})
dataset = datasets.Dataset(pa.Table.from_pandas(df))
    # 10-fold cross-validation:
folds = KFold(n_splits=10, shuffle=False)
for i, (train_index, test_index) in enumerate(folds.split(list(range(len(dataset))))):
train_dataset = dataset[train_index]
test_dataset = dataset[test_index]
pipe = create_pipeline(SGDClassifier())
pipe.fit(train_dataset['text'], train_dataset['label'])
y_pred = pipe.predict(test_dataset['text'])
metrics.update_metrics(test_dataset['label'], y_pred, False)
# train_save_SVM_for_clusterset_evaluation(label_set)
# metrics.save_scores_to_disk("diagnose_texts_with_SGD")
return metrics
def cross_validate_label_corpus_with_simple_SVM(labels, path2corpus = "./database/bow_prepro_diag.pkl", sample = True):
"""
trains a simple SVM with the given data
returns 10-fold-cross-validated accuracy value
"""
texts = pd.read_pickle(path2corpus)
from database_preparation.utils_labeled_datasets import text_label_2_labeled_dataset
metrics = cls_metrics.ClassificationMetrics("temp")
#print("train SVM for cluster " + label_set + " with " + path2corpus + ".")
text_lst = pd.read_pickle(path2corpus)
text1 = np.asarray(text_lst[0])
corpus_is_tokenized = bool(text1.ndim)
del text1, text_lst
if corpus_is_tokenized:
dataset = text_label_2_labeled_dataset(texts,labels)
else:
dataset_raw = text_label_2_labeled_dataset(texts,labels)
# tokenize
tokenized_texts = []
for t_text in dataset_raw['text']:
tokenized_texts.append(nltk.tokenize.word_tokenize(t_text, language='german'))
df = pd.DataFrame({'text': tokenized_texts, 'label': dataset_raw['label']})
dataset = datasets.Dataset(pa.Table.from_pandas(df))
    # 10-fold cross-validation:
folds = KFold(n_splits=10, shuffle=False)
for i, (train_index, test_index) in enumerate(folds.split(list(range(len(dataset))))):
train_dataset = dataset[train_index]
test_dataset = dataset[test_index]
pipe = create_pipeline(SGDClassifier())
pipe.fit(train_dataset['text'], train_dataset['label'])
y_pred = pipe.predict(test_dataset['text'])
metrics.update_metrics(test_dataset['label'], y_pred, False)
if sample:
return metrics.scores['accuracy']
# train_save_SVM_for_clusterset_evaluation(label_set)
# metrics.save_scores_to_disk("diagnose_texts_with_SGD")
return np.mean(metrics.scores['accuracy'])
def train_SVM_with_clusterset(label_set, path2corpus = "./database/bow_prepro_diag.pkl", path2dfcases='./database/df_cases.pkl'):
"""
trains ans saves a svm, trained with the whole data under as:
"./ModelTestingAndExplaining/models/SVM_trained_with_" + label_set + "_clustered.pkl"
"""
print("train SVM for cluster " + label_set + " with " + path2corpus + ".")
text_lst = pd.read_pickle(path2corpus)
text1 = np.asarray(text_lst[0])
corpus_is_tokenized = bool(text1.ndim)
del text1, text_lst
if corpus_is_tokenized:
dataset = dt.text_label_files_to_labeled_dataset(label_set,
path2corpus
, path2dfcases, False)
else:
dataset_raw = dt.text_label_files_to_labeled_dataset(label_set,
path2corpus
, path2dfcases, False)
# tokenize
tokenized_texts = []
for t_text in dataset_raw['text']:
tokenized_texts.append(nltk.tokenize.word_tokenize(t_text, language='german'))
df = pd.DataFrame({'text': tokenized_texts, 'label': dataset_raw['label']})
dataset = datasets.Dataset(pa.Table.from_pandas(df))
pipe = create_pipeline(SVC(probability=True, kernel='linear'))
'''svd = TruncatedSVD(n_components=100, n_iter=7, random_state=42)
pipe = make_pipeline(make_pipeline(
TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False),svd),
SVC(C=150, gamma=2e-2, probability=True))'''
pipe.fit(dataset['text'], dataset['label'])
path = "./ModelTestingAndExplaining/models/SVM_trained_with_" + label_set + "_clustered.pkl"
pickle.dump(pipe, open(path, 'wb'))
def update_cls_metric(label_set, cls_accuracy):
file_name = label_set + "_Diagnosis"
file_name = file_name.replace('KMeans', 'kmeans')
file_name = file_name.replace('d2v', 'doc2vec')
file_path = "TextClustering/cluster_metrics/" + file_name + ".pkl"
try:
scores = pd.DataFrame(pd.read_pickle(file_path))
except Exception:
return
if 'cls accuracy' in scores.index:
scores.loc['cls accuracy', file_name] = cls_accuracy
new_scores = scores
else:
vals = list(scores[file_name])
new_index = scores.index.append(pd.Index(['cls accuracy']))
vals.append(cls_accuracy)
new_scores = pd.DataFrame({file_name: vals}, index=new_index)
new_scores.to_pickle(file_path)
def update_cls_metric_for_each_clusterset():
'''
does 10-fold-cross-validation with a svm for each cluster-set saved in './database/df_cases.pkl'
using always the text in 'database/diag_lst_tokenized.pkl'
'''
label_sets = dt.get_all_label_set_ids()
# label_sets = ["German_BERT"]
for label_set in label_sets:
accuracy = np.mean(cross_validate_with_simple_SVM(label_set,
'database/diag_lst_tokenized.pkl',
'./database/df_cases.pkl').scores['accuracy'])
print("svm-cls-accuracy of cluster set "+label_set+": "+str(accuracy))
update_cls_metric(label_set, accuracy)
def main():
#update_cls_metric_for_each_clusterset()
cluster_set_name = "German_BERT"
#text_data = 'database/darmischaemie_prostata_txt_lst.pkl' cluster_set_dict = './database/df_cases2.pkl'
text_data = 'database/diag_lst.pkl'
#text_data = 'database/diag_lst_tokenized.pkl'
cluster_set_dict = './database/df_cases.pkl'
train_SVM_with_clusterset(cluster_set_name, text_data, cluster_set_dict)
if __name__ == '__main__':
main()
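# --- Hedged usage sketch (illustration only; toy texts, labels and parameters below are made up) ---
# Mirrors the idea of the cross-validation functions above: TF-IDF on pre-tokenized texts,
# dimensionality reduction, SGD classifier, k-fold evaluation.
def _toy_cross_validation_sketch():
    from sklearn.pipeline import Pipeline
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.decomposition import TruncatedSVD
    from sklearn.linear_model import SGDClassifier
    from sklearn.model_selection import KFold
    from sklearn.metrics import accuracy_score
    import numpy as np

    def identity(word):
        return word

    texts = [["niere", "biopsie", "glomerulus"], ["glomerulus", "sklerose"],
             ["tubulus", "atrophie"], ["interstitium", "fibrose", "tubulus"],
             ["niere", "glomerulus"], ["tubulus", "interstitium"]]
    labels = [0, 0, 1, 1, 0, 1]

    accuracies = []
    for train_idx, test_idx in KFold(n_splits=3, shuffle=False).split(texts):
        pipe = Pipeline([
            ('vectorizer', TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)),
            ('reduction', TruncatedSVD(n_components=2)),  # 2 instead of 1000 for the tiny toy vocabulary
            ('classifier', SGDClassifier()),
        ])
        pipe.fit([texts[i] for i in train_idx], [labels[i] for i in train_idx])
        y_pred = pipe.predict([texts[i] for i in test_idx])
        accuracies.append(accuracy_score([labels[i] for i in test_idx], y_pred))
    return np.mean(accuracies)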
import time
import json
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, cohen_kappa_score
import numpy as np
import os
import sys
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
# from sklearn.metrics import plot_confusion_matrix  # unused here; removed in newer scikit-learn versions
import seaborn as sn
import pandas as pd
import pickle
sys.path.append(os.getcwd())
class ClassificationMetrics(object):
def __init__(self, model_name, metrics_save_name="metrics_new", **kwargs):
self.scores = {
'name': model_name,
'fold_amount': 0,
'accuracy': [],
'precision': [],
'recall': [],
'f1': [],
'cohen_kappa': [],
'time': [],
}
self.y_preds = []
self.y_tests = []
# create classification-metrics folder if not exist:
if not os.path.isdir("./TextClassification/cls_metrics"):
os.makedirs("./TextClassification/cls_metrics")
# create subfolder for our metrics if not exist:
self.metrics_path = "./TextClassification/cls_metrics/"+metrics_save_name+"/"
if not os.path.isdir(self.metrics_path):
os.makedirs(self.metrics_path)
# save paths:
self.json_file_path = "none"
self.object_dir = "none"
def update_metrics(self, y_test, y_pred, print_cls_report=False, start_time=None):
'''
call this for each test run if you do k-fold-cross-validation
'''
if print_cls_report:
print(classification_report(y_test, y_pred))
self.y_preds.append(y_pred)
self.y_tests.append(y_test)
self.scores['fold_amount'] += 1
if start_time != None:
self.scores['time'].append(time.time() - start_time)
else:
self.scores['time'].append(-1)
self.scores['accuracy'].append(accuracy_score(y_test, y_pred))
# the ability of the classifier not to label as positive a sample that is negative - tp / (tp + fp)
# -> precision = 1 -> This class was detected perfectly. There are only TPs! (true positives)
# -> precision = 0.75 -> there are some false positives: sometimes the classifier predicted class A, but it wasn't
self.scores['precision'].append(precision_score(y_test, y_pred, average='weighted'))
# the ability of the classifier to find all the positive samples - tp / (tp + fn)
self.scores['recall'].append(recall_score(y_test, y_pred, average='weighted'))
self.scores['f1'].append(f1_score(y_test, y_pred, average='weighted'))
# Cohen's kappa is the two-rater counterpart of Fleiss' kappa (they differ slightly in how chance agreement is estimated).
# The kappa score measures the degree of agreement between
# the two raters, also known as inter-rater reliability.
self.scores['cohen_kappa'].append(cohen_kappa_score(y_test, y_pred))
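# Illustrative (hedged) toy example: for y_test = [0, 0, 1, 1] and y_pred = [0, 1, 1, 1],
# class-0 precision = 1/1 and class-1 precision = 2/3; weighted by the true supports (2, 2)
# this gives a weighted precision of ~0.83, a weighted recall of 0.75 and a Cohen's kappa
# of 0.5 (observed agreement 0.75, chance agreement 0.5).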
def clean_class_score_table(self, df):
df.drop(['accuracy', 'macro avg', 'weighted avg'], axis=1, inplace=True)
df.drop(['precision', 'recall'], axis=0, inplace=True)
df = df.T
# round f1-values
for i, x in enumerate(df['f1-score']):
df['f1-score'][i] = round(x, 3)
# edit support entries:
integer_support = [str(x)[:-2] for x in df['support']]
df['support'] = integer_support
df.sort_values(by=['f1-score'], inplace=True, ascending=False)
return df
def get_merged_predictions(self):
merged_y_tests = []
merged_y_preds = []
for i in range(0, len(self.y_tests)):
for x in self.y_preds[i]:
merged_y_preds.append(x)
for y in self.y_tests[i]:
merged_y_tests.append(y)
return merged_y_tests, merged_y_preds
def classes_scores(self, prediction_set=0):
'''
returns per-class scores (f1-score and support) as a DataFrame
'''
if prediction_set < 0:
merged_y_tests, merged_y_preds = self.get_merged_predictions()
dic = classification_report(merged_y_tests, merged_y_preds,
output_dict=True)
df = pd.DataFrame(dic)
return self.clean_class_score_table(df)
else:
dic = classification_report(self.y_tests[prediction_set], self.y_preds[prediction_set],
output_dict=True)
df = pd.DataFrame(dic)
return self.clean_class_score_table(df)
def plot_confusion_matrix(self, labels, prediction_set=0, plot=False, save=True,
filename='confusion_matrix', title=None,
normalized=True, annot = False, colormap='gray'):
if title == None:
title = filename
if prediction_set < 0:
y_test, y_pred = self.get_merged_predictions()
else:
y_test = self.y_tests[prediction_set]
y_pred = self.y_preds[prediction_set]
try:
conf_matrix = np.asarray(confusion_matrix(y_test , y_pred,labels=labels),dtype=float)
if normalized:
for y, row in enumerate(conf_matrix[:]):
sum_apperiance = np.sum(row)
for x, pred_amount in enumerate(row):
if sum_apperiance == 0:
row[x] = 0
else:
row[x] = round(pred_amount / sum_apperiance,2)
conf_matrix[y] = row
except ValueError:
if labels[0] == 0:
labels2=['class'+str(a) for a in range(len(labels))]
try:
conf_matrix = confusion_matrix(y_test, y_pred, labels=labels2)
except ValueError:
print("confusion_matrix generation failed.")
print("labels:")
print(labels)
print("y_test:")
print(self.y_tests[prediction_set])
print("y_pred:")
print(self.y_preds[prediction_set])
return
else:
print("confusion_matrix generation failed.")
print("labels:")
print(labels)
print("y_test:")
print(self.y_tests[prediction_set])
print("y_pred:")
print(self.y_preds[prediction_set])
return
#print(conf_matrix)
df_cm = pd.DataFrame(conf_matrix, labels, labels)
sn.set(font_scale=1.4) # for label size
if plot or save:
plt.close()
if normalized:
hm = sn.heatmap(df_cm, annot=annot, vmin=0, vmax=1, cmap=colormap)  # note: this call occasionally causes problems
else:
hm = sn.heatmap(df_cm, annot=annot, annot_kws={"size": 10}, cmap=colormap)
plt.xlabel("predicted", fontsize=14)
plt.ylabel("true", fontsize=14)
plt.title(title, fontsize=16)
if plot:
plt.show()
if save:
figure = hm.get_figure()
save_path = "TextClassification/plots/"+filename+".png"
try:
figure.savefig(save_path, dpi=300)
print("generated "+save_path)
except FileNotFoundError:
os.mkdir("TextClassification/plots")
figure.savefig(save_path, dpi=300)
print("generated " + save_path)
def save_scores_to_disk(self, labelset):
'''
if the file already exists, the new scores are appended as a new row
'''
# save scores as table, appending if modelname already exist:
self.json_file_path = self.metrics_path+labelset+"_clustered_all_classifiers.json"
if os.path.isfile(self.json_file_path):
# add number to name, if model appears already in json file:
with open(self.json_file_path, 'r') as f:
amount_same_name = 0
for line in f:
scores = json.loads(line)
if self.scores['name'] in scores["name"]:
amount_same_name += 1
if amount_same_name > 0:
self.scores['name']=self.scores['name']+"_"+str(amount_same_name+1)
with open(self.json_file_path, 'a') as f:
f.write(json.dumps(self.scores) + "\n")
def pickle_object(self, labelset, model_name='default'):
# pickles whole object
if model_name == 'default':
model_name = self.scores['name']
self.object_dir = self.metrics_path + labelset + "_clustered_" + model_name + "_classified.pickle"
with open(self.object_dir, 'wb') as f:
pickle.dump(self, f)
def print_results_as_latextable(jsonfile, print_only_f1_kappa=True):
'''
Returns the results as a LaTeX table.
Expects jsonfile (path to a json file) as saved by the metrics object.
You can obtain the json file of a metrics object via metrics.json_file_path.
'''
if print_only_f1_kappa:
print("================== " + jsonfile + " ==================")
fields = [key for key in ClassificationMetrics(None).scores.keys()]
to_remove = ["fold_amount","accuracy","precision","recall"]
for remove in to_remove:
fields.remove(remove)
table = []
with open(jsonfile, 'r') as f:
for idx, line in enumerate(f):
scores = json.loads(line)
row = [scores['name']]
for field in fields[1:]:
row.append("{:0.3f}".format(np.mean(scores[field])))
table.append(row)
# sort over f1 score:
table.sort(key=lambda r: r[1], reverse=True)
# print(tabulate.tabulate(table, headers=fields))
else:
print("================== " + jsonfile + " ==================")
fields = [key for key in ClassificationMetrics(None).scores.keys()]
table = []
with open(jsonfile, 'r') as f:
for idx, line in enumerate(f):
scores = json.loads(line)
row = [scores['name'], scores['fold_amount']]
for field in fields[2:]:
row.append("{:0.3f}".format(np.mean(scores[field])))
table.append(row)
# sort over f1 score:
table.sort(key=lambda r: r[5], reverse=True)
# print(tabulate.tabulate(table, headers=fields))
# export it to a df and then to a latex table:
df = pd.DataFrame(columns=fields)
for i, field in enumerate(fields):
# df.append()
df[field] = [e[i] for e in table]
df.drop(columns=['time'], axis=1, inplace=True)
as_latex = df.to_latex(index=False)
print(as_latex)
return as_latex
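# Hedged usage sketch (illustrative; the classifier and label-set names below are made up):
#   metrics = ClassificationMetrics("SGDClassifier")
#   ... call metrics.update_metrics(y_test, y_pred) once per fold ...
#   metrics.save_scores_to_disk("HDBSCAN")
#   print_results_as_latextable(metrics.json_file_path)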
def main():
y_true = [0,1,0,1,0,2]
y_pred = [1,1,0,1,0,2]
metrics = ClassificationMetrics("metrics_test")
metrics.update_metrics(y_true,y_pred)
metrics.save_scores_to_disk("testitest")
metrics.pickle_object("testitest")
metrics.plot_confusion_matrix([i for i in range(3)],0,True,True)
if __name__ == "__main__":
main()
import os
####### pipeline parameters ################
#cluster_sets = ['KMeans', 'LDA', 'HDBSCAN', 'GSDPMM', 'German_BERT', 'Patho_BERT', 'top2vec']
cluster_sets = ['HDBSCAN']
# params:
path2corpus_bow_preprocessed = 'database/bow_prepro_desc.pkl'
path2corpus_embedding_preprocessed = 'database/embedding_prepro_desc.pkl'
#check working directory:
workdir = os.getcwd()
if not workdir[-len('nlp-in-diagnostic-texts-from-nephropathology'):] == 'nlp-in-diagnostic-texts-from-nephropathology':
print(workdir + " is the wrong working directory.")
print("please make shure to run this script with working directory '.../path/to/nlp-in-diagnostic-texts-from-nephropathology'.")
exit(1)
for cluster_set in cluster_sets:
script_queue = [
f"python TextClassification/bow_classification.py --clustered_data {cluster_set} --path2corpus {path2corpus_bow_preprocessed}",
f"python TextClassification/RNN_classification.py --clustered_data {cluster_set} --path2corpus {path2corpus_embedding_preprocessed}",
f"python TextClassification/CNN_classification.py --clustered_data {cluster_set} --path2corpus {path2corpus_embedding_preprocessed}",
#f"python TextClassification/bert_classification.py --clustered_data {cluster_set} --path2corpus {path2corpus_embedding_preprocessed}",
f"python TextClassification/print_classification_metrics.py --clustered_data {cluster_set}"
]
for script in script_queue:
print("\n########################################### executing ###########################################")
print(script)
print("####################################################################################################\n")
os.system(script)
\ No newline at end of file
import matplotlib.pyplot as plt
import math
import json
import argparse
def plot_loss_curve(path2json, title='loss'):
with open(path2json) as f:
log_history = json.load(f)["log_history"]
# Keep track of train and evaluate loss.
loss_history = {'train_loss': [], 'eval_loss': [],
'train_steps': [], 'train_epochs': [],
'eval_steps': [], 'eval_epochs': []}
# Keep track of train and evaluate perplexity.
# This is a metric useful to track for language models.
perplexity_history = {'train_perplexity': [], 'eval_perplexity': []}
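# Note: perplexity = exp(cross-entropy loss); e.g. a loss of 2.0 corresponds to a
# perplexity of exp(2.0) ~ 7.39, i.e. the model is on average about as uncertain as
# choosing uniformly between ~7 tokens.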
for log in log_history:
if 'loss' in log.keys():
# Deal with training loss.
loss_history['train_loss'].append(log['loss'])
perplexity_history['train_perplexity'].append(math.exp(log['loss']))
loss_history['train_epochs'].append(log["epoch"])
loss_history['train_steps'].append(log["step"])
elif 'eval_loss' in log.keys():
# Deal with eval loss.
loss_history['eval_loss'].append(log['eval_loss'])
perplexity_history['eval_perplexity'].append(math.exp(log['eval_loss']))
loss_history['eval_epochs'].append(log["epoch"])
loss_history['eval_steps'].append(log["step"])
# Plot Losses.
plt.figure()
plt.plot(loss_history['eval_epochs'], loss_history["eval_loss"],
label="eval loss")
plt.plot(loss_history['train_epochs'], loss_history["train_loss"],
label="train loss")
plt.xlabel("epoch", fontsize=14)
plt.ylabel("loss", fontsize=14)
plt.title(title, fontsize=16)
plt.grid(True)
plt.legend()
plt.show()
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--path_to_trainer_state_file",
default='./LanguageModelling/ger-patho-bert-w3/trainer_state.json')
args = parser.parse_args()
# example how to plot loss curve:
plot_loss_curve(args.path_to_trainer_state_file,
args.path_to_trainer_state_file.replace('/trainer_state.json',''))
if __name__ == '__main__':
main()
\ No newline at end of file
import TextClassification.classification_metrics as cls_metrics
import glob
import sys, os
from TextClassification.argsparse_classification_preamble import argsparse_preamble
import pickle
import database_preparation.utils_labeled_datasets as dt
sys.path.append(os.getcwd())
# script parameters:
metrics_folder = "cls_metrics/metrics_new"
def generate_save_conf_matrix(model_name, clustered_data,
test_set_index):
ob_dir = "./TextClassification/" + metrics_folder + "/" \
+ clustered_data + "_clustered_" + model_name + "_classified.pickle"
try:
with open(ob_dir, 'rb') as f:
metrics = pickle.load(f)
except FileNotFoundError:
return False
# plot confusion matrix
if "ger-patho-bert" in model_name:
titlename = "Patho-BERT"
elif "german" in model_name:
titlename = "German-BERT"
elif "SGD" in model_name:
titlename = "SGD-classifier"
elif "MLP" in model_name:
titlename = "MLP-classifier"
else:
titlename = model_name
unique_labels = dt.get_amount_unique_labels(clustered_data)
labels = [a for a in range(unique_labels)]
# labels=['class'+str(a) for a in range(unique_labels)]
# https://matplotlib.org/3.5.1/tutorials/colors/colormaps.html
metrics.plot_confusion_matrix(labels, prediction_set=test_set_index,
plot=False, save=True,
filename="confmatrx_"+clustered_data + "_clustered_" + model_name + "_classified",
title= titlename, normalized=True, annot = False, colormap='gist_heat')
return True
def print_f1_per_clusters(model_name, clustered_data):
# print f1-scores for each class, merged over all test runs
ob_dir = "./TextClassification/" + metrics_folder + "/" \
+ clustered_data + "_clustered_" + model_name + "_classified.pickle"
try:
with open(ob_dir, 'rb') as f:
metrics = pickle.load(f)
except FileNotFoundError:
return False
print("================ model: " + model_name + " | cluster-set: " + clustered_data + " ================")
df = metrics.classes_scores(-1)
print(df.to_latex().replace('{}', 'cluster'))
return True
def main():
args = argsparse_preamble()
model_names = ['SGDClassifier', 'MLPClassifier', 'ger-patho-bert-2', 'bert-base-german-cased',
'CNN', 'RNN', 'LogisticRegression', 'MultinomialNB']
# print f1 scores for each classification model:
for model in model_names:
print_f1_per_clusters(model, args.clustered_data)
# print classification overview tables for each clustering method:
print()
file_list = glob.glob("./TextClassification/" + metrics_folder + "/" + '/*.json')
for file in file_list:
cls_metrics.print_results_as_latextable(file, True)
print()
# generate all confusion matrices for each classification model:
for model in model_names:
try:
generate_save_conf_matrix(model, args.clustered_data, -1)
except Exception:
print(f"can't generate confusion matrix for {model}")
if __name__ == '__main__':
main()
\ No newline at end of file
import argparse
import sys, os
def argsparse_preamble():
sys.path.append(os.getcwd())
parser = argparse.ArgumentParser()
parser.add_argument("--find_k_value", action='store_true')
parser.add_argument("--k_value", type=int, default=10)
parser.add_argument("--show_figures", action='store_true')
parser.add_argument("--model2use", default="German_BERT")
parser.add_argument('--do_embedding', action='store_true')
parser.add_argument("--path2corpus", default='database/bow_prepro_diag.pkl')
parser.add_argument("--df_cases_file", default='database/df_cases.pkl')
args = parser.parse_args()
return args
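# Hedged CLI example (the script name is a placeholder; the flags match the parser above):
#   python TextClustering/<some_clustering_script>.py --k_value 10 \
#       --path2corpus database/bow_prepro_diag.pkl --df_cases_file database/df_cases.pkl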
#%%
from __future__ import unicode_literals, print_function, division
import csv
import numpy as np
class GSDPMM:
def __init__(self, K, alpha, beta, iterNum, dataset):
self.K=K
self.alpha=alpha
self.beta=beta
self.iterNum=iterNum
self.dataset=dataset
self.docu_set=docu_set(self.dataset)
self.docu_num=self.docu_set.docu_num
self.V=self.docu_set.V
self.alpha0=K*self.alpha
self.beta0=self.V*beta
self.m_z=np.zeros(K,dtype=int)
self.n_z=np.zeros(K,dtype=int)
self.n_zv=np.zeros([K,self.V],dtype=int)
self.z_c=np.zeros(self.docu_num,dtype=int)
self.num_list=self.docu_set.num_list
self.wordid_array=self.docu_set.wordid_array
self.wordfreq_array=self.docu_set.wordfreq_array
self.largedouble=1e100
self.smalldouble=1e-100
def initialize(self):
for d in range(self.docu_num):
self.z_c[d]=int(np.floor(self.K*np.random.uniform()))
cluster=self.z_c[d]
self.m_z[cluster]=self.m_z[cluster]+1
for w in range(len(self.num_list[d])):
self.n_zv[cluster][self.num_list[d][w]]=self.n_zv[cluster][self.num_list[d][w]]+1
self.n_z[cluster]=self.n_z[cluster]+1
def gibbs_sampling(self):
for i in range(self.iterNum):
for d in range(self.docu_num):
cluster=self.z_c[d]
self.m_z[cluster]=self.m_z[cluster]-1
for w in range(len(self.num_list[d])):
self.n_zv[cluster][self.num_list[d][w]]=self.n_zv[cluster][self.num_list[d][w]]-1
self.n_z[cluster]=self.n_z[cluster]-1
cluster=self.sample_cluster(d)
self.z_c[d]=cluster
self.m_z[cluster]=self.m_z[cluster]+1
for w in range(len(self.num_list[d])):
self.n_zv[cluster][self.num_list[d][w]]=self.n_zv[cluster][self.num_list[d][w]]+1
self.n_z[cluster]=self.n_z[cluster]+1
print(f'iteration {i}/{self.iterNum}')
def sample_cluster(self, d):
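# Samples a cluster for document d. The code below computes the GSDPMM conditional:
#   p(z_d = k) is proportional to (m_z[k] + alpha) / (docu_num + alpha0)
#       * prod over words w of prod_{j=0..freq(w)-1} (n_zv[k][w] + beta + j) / (n_z[k] + beta0 + i)
# where m_z[k] = documents in cluster k, n_z[k] = word tokens in cluster k,
# n_zv[k][w] = count of word w in cluster k, and i runs over the tokens of document d.
# The overflow_count bookkeeping rescales the running product to avoid floating-point underflow.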
prob=np.zeros(self.K)
overflow_count=np.zeros(self.K)
for k in range(self.K):
prob[k]=(self.m_z[k]+self.alpha)/(self.docu_num+self.alpha0)
value2=1.0
i=0
for w in range(len(self.wordid_array[d])):
wordNo=self.wordid_array[d][w]
wordfreq=self.wordfreq_array[d][w]
for j in range(wordfreq):
value2=value2*(self.n_zv[k][wordNo]+self.beta+j)/(self.n_z[k]+self.beta0+i)
i=i+1
if value2<self.smalldouble:
overflow_count[k]=overflow_count[k]-1
value2=value2*self.largedouble
prob[k]=prob[k]*value2
self.recompute_prob(prob, overflow_count, self.K)
for k in range(1,self.K):
prob[k]=prob[k-1]+prob[k]
sample=np.random.uniform()*prob[self.K-1]
kchoosed=0
for kchoosed in range(self.K):
if sample<prob[kchoosed]:
break
return kchoosed
def recompute_prob(self, prob, overflow_count, K):
max_common=-1e20
for k in range(K):
if overflow_count[k]>max_common and prob[k]>0:
max_common=overflow_count[k]
for k in range(K):
if prob[k]>0:
prob[k]=prob[k]*pow(self.largedouble,overflow_count[k]-max_common)
class docu_set:
def __init__(self, dataset):
self.docu_num=0
self.docs=[]
self.result=self.read_data(dataset)
self.lines=self.result[0]
self.wordtoId={}
self.wordfreq={}
self.V=len(self.wordtoId)
self.num_list, self.wordid_array, self.wordfreq_array=self.convert_to_numlist()
def read_data(self,filename):
data=[]
target=[]
with open(filename,'r') as csvfile:
line_reader=csv.reader(csvfile)
for line in line_reader:
data.append(line[2])
#target.append(line[3])
self.docu_num=len(data)
print(len(data))
return [data,target]
def convert_to_numlist(self):
n_lines=len(self.lines)
num_list=[[] for i in range(n_lines)]
wordid_array=[[] for i in range(n_lines)]
wordfreq_array=[[] for i in range(n_lines)]
for i in range(n_lines):
this_line=self.lines[i]
split_line=this_line.split()
for j in range(len(split_line)):
if split_line[j] in self.wordtoId:
self.wordfreq[self.wordtoId[split_line[j]]]=self.wordfreq[self.wordtoId[split_line[j]]]+1
Id=self.wordtoId.get(split_line[j])
if Id in wordid_array[i]:
wordfreq_array[i][wordid_array[i].index(Id)]+=1
else:
wordid_array[i].append(Id)
wordfreq_array[i].append(1)
else:
self.wordtoId[split_line[j]]=self.V
self.V=self.V+1
self.wordfreq[self.wordtoId[split_line[j]]]=1
Id=self.wordtoId.get(split_line[j])
if Id in wordid_array[i]:
wordfreq_array[i][wordid_array[i].index(Id)]+=1
else:
wordid_array[i].append(Id)
wordfreq_array[i].append(1)
num_list[i].append(self.wordtoId[split_line[j]])
return num_list, wordid_array, wordfreq_array
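# --- Hedged usage sketch (illustration only; the file name and toy texts are made up) ---
# read_data() expects the document text in the third CSV column (line[2]), i.e. the
# layout (index, 'text', document) that the GSDPMM clustering script writes to its temp CSV.
def _gsdpmm_toy_example(tmp_csv='gsdpmm_toy_example.csv'):
    docs = ["niere biopsie glomerulus", "glomerulus sklerose niere",
            "tubulus atrophie interstitium", "interstitium fibrose tubulus"]
    with open(tmp_csv, 'w', newline='') as f:
        writer = csv.writer(f)
        for i, doc in enumerate(docs):
            writer.writerow([i, 'text', doc])  # column 2 holds the document text
    model = GSDPMM(K=2, alpha=0.3, beta=0.02, iterNum=3, dataset=tmp_csv)
    model.initialize()
    model.gibbs_sampling()
    return model.z_c  # one cluster id per document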
#%% argsparse section
import sys, os
sys.path.append(os.getcwd())
from TextClustering.argsparse_clustering_preamble import argsparse_preamble
args = argsparse_preamble()
from database_preparation.utils_labeled_datasets import is_text_lst_tokenized
#%% import section
import pickle
from TextClustering.basedOn_BOW.GSDPMM import *
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import umap
from tqdm import tqdm
from TextClustering.utils_metrics import ClusterMetrics
from database_preparation.preprocess import print_meta_data
from TextClassification.classification_for_cluster_evaluation import cross_validate_label_corpus_with_simple_SVM
#%% load the data
with open(args.path2corpus, 'rb') as f:
diag_lst = pickle.load(f)
print_meta_data(args.path2corpus)
#%% and save it for DPMM
text = ['text'] * len(diag_lst)
if is_text_lst_tokenized(args.path2corpus):
text_tupls = list(zip(text, [' '.join(tokenized_text) for tokenized_text in diag_lst]))
else:
text_tupls = list(zip(text, [text for text in diag_lst]))
df = pd.DataFrame(text_tupls)
df.to_csv('TextClustering/basedOn_BOW/temp.csv', header=None)
def identity(word):
return word
def create_vectorizer(data):
vec = TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)
return vec.fit_transform(data)
text_features = create_vectorizer(diag_lst)
#%% find the best hyperparameter
if args.find_k_value:
# %% set the parameter
args.alpha = 0.3
args.beta = 0.02
args.iterNum = 5
args.dataset = 'TextClustering/basedOn_BOW/temp.csv'
k_list = np.arange(3, 23, 1)  # candidate numbers of clusters K (the search is over K, not beta)
s_score, n_cluster, svm_scores = [], [], []
n_steps = []
for i in tqdm(k_list):
#%% initialize it
gsdmm = GSDPMM(i,
args.alpha, args.beta,
args.iterNum,
args.dataset)
gsdmm.initialize()
# %% actually do it
gsdmm.gibbs_sampling()
#%% evalute the model
evaluation = ClusterMetrics(text_features, gsdmm.z_c)
s_score.append(evaluation.s_score)
svm_scores.append(
cross_validate_label_corpus_with_simple_SVM(gsdmm.z_c, args.path2corpus + '.pkl',
False))
n_cluster.append(len(np.unique(gsdmm.z_c)))
n_steps.append(i)
#%% plot it
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ax3 = ax1.twinx()
ax1.plot(n_steps, s_score, 'bx-')
ax2.plot(n_steps, n_cluster, 'rx-')
ax3.plot(n_steps, svm_scores, 'gx-')
ax1.set_xlabel('Number of clusters K')
ax1.yaxis.label.set_color('blue')
ax1.set_ylabel('s-score')
ax2.yaxis.label.set_color('red')
ax2.set_ylabel('Number of clusters')
ax3.yaxis.label.set_color('green')
ax3.set_ylabel('svm accuracy')
plt.title('Elbow-method-like plot')
plt.savefig("TextClustering/plots/elbow_method/GSDPMM_elbow_plot.png", dpi=300)
plt.show()
sys.exit()
#%% set the parameter
args.alpha = 0.3
args.beta = 0.02
args.iterNum = 5
args.dataset = 'TextClustering/basedOn_BOW/temp.csv'
gsdmm=GSDPMM(args.k_value,
args.alpha, args.beta,
args.iterNum,
args.dataset)
gsdmm.initialize()
#%% actually do it
gsdmm.gibbs_sampling()
#%% retrieve the results
A=gsdmm.z_c
num_list=gsdmm.num_list
m_z=gsdmm.m_z
n_z=gsdmm.n_z
n_zv=gsdmm.n_zv
docu_num=gsdmm.docu_num
predictedCluster=gsdmm.z_c
wordid_array=gsdmm.wordid_array
wordfreq_array=gsdmm.wordfreq_array
#%% save UMAP data points:
umap_text_features2D = umap.UMAP(n_neighbors=15,
n_components=2,
min_dist=0.0, metric='cosine').fit_transform(text_features)
# save umaped vectors and labels:
df = pd.read_pickle(args.df_cases_file)
df['umapX_GSDPMM'] = umap_text_features2D[:, 0]
df['umapY_GSDPMM'] = umap_text_features2D[:, 1]
df['label_GSDPMM'] = predictedCluster
df.to_pickle(args.df_cases_file)
#%% evalute the model
from TextClustering.utils_metrics import ClusterMetrics
evaluation = ClusterMetrics(text_features, predictedCluster,
file_name= "TextClustering/cluster_metrics/GSDPMM_metrics.pkl")
evaluation.write_to_file()
#%% argsparse section
import sys, os
sys.path.append(os.getcwd())
from TextClustering.argsparse_clustering_preamble import argsparse_preamble
args = argsparse_preamble()
from database_preparation.utils_labeled_datasets import is_text_lst_tokenized, is_text_lst_tfidf_vectorized
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import pandas as pd
import umap
import hdbscan
import numpy as np
from nltk import RegexpTokenizer
from TextClustering.utils_metrics import ClusterMetrics
from database_preparation.preprocess import print_meta_data
from TextClassification.classification_for_cluster_evaluation import cross_validate_label_corpus_with_simple_SVM
tokenizer = RegexpTokenizer(r'\w+')
#%% load the data
with open(args.path2corpus, 'rb') as f:
diag_lst = pickle.load(f)
print_meta_data(args.path2corpus)
def identity(word):
return word
text_is_vectorized = is_text_lst_tfidf_vectorized(args.path2corpus)
if not (is_text_lst_tokenized(args.path2corpus) or text_is_vectorized):
print("Error: " + args.path2corpus + ' has to be a list of tokenized texts or tfidf-vectorized texts!')
exit(1)
def create_vectorizer(data):
vec = TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)
vec = vec.fit_transform(data)
return vec
if text_is_vectorized:
text_features = diag_lst
else:
text_features = create_vectorizer(diag_lst)
#%% perform umap for dimension-reduction (for cluster-detection)
umap_text_features = umap.UMAP(n_neighbors=15,
n_components=5,
metric='cosine').fit_transform(text_features)
# and perform umap-dimension-reduction for visualization
umap_text_features2D = umap.UMAP(n_neighbors=15,
n_components=2,
min_dist=0.0, metric='cosine').fit_transform(text_features)
if args.find_k_value:
# %% perform hdbscan for cluster-detection with different minimum cluster sizes to find a good solution ... by eye
list_cluster_size = [int(k) for k in np.arange(3, 23, 1)]
s_score, n_cluster, svm_scores = [], [], []
for i_cluster_size in list_cluster_size:
cluster = hdbscan.HDBSCAN(min_cluster_size=i_cluster_size,
metric='euclidean',
cluster_selection_method='eom').fit(umap_text_features)
result = pd.DataFrame(umap_text_features2D, columns=['x', 'y'])
result['labels'] = cluster.labels_.tolist() # cluster.labels_
print(np.unique(result.labels))
#%% Visualize clusters
outliers = result.loc[result.labels == -1, :]
clustered = result.loc[result.labels != -1, :]
clustered['labels'] = [str(i) for i in clustered['labels']]
evaluation = ClusterMetrics(umap_text_features, cluster.labels_.tolist())
s_score.append(evaluation.s_score)
svm_scores.append(
cross_validate_label_corpus_with_simple_SVM(cluster.labels_.tolist(), args.path2corpus + '.pkl',
False))
n_cluster.append(len(np.unique(cluster.labels_.tolist())))
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ax3 = ax1.twinx()
ax1.plot(list_cluster_size, s_score, 'bx-')
ax2.plot(list_cluster_size, n_cluster, 'rx-')
ax3.plot(list_cluster_size, svm_scores, 'gx-')
ax1.set_xlabel('Minimal cluster size')
ax1.yaxis.label.set_color('blue')
ax1.set_ylabel('s-score')
ax2.yaxis.label.set_color('red')
ax2.set_ylabel('Number of clusters')
ax3.yaxis.label.set_color('green')
ax3.set_ylabel('svm accuracy')
plt.title('Elbow-method-like plot')
plt.savefig("TextClustering/plots/elbow_method/HDBSCAN_elbow_plot.png", dpi=300)
plt.show()
exit()
#%% perform hdbscan with best cluster size
cluster = hdbscan.HDBSCAN(min_cluster_size=args.k_value,
metric='euclidean',
cluster_selection_method='eom').fit(umap_text_features)
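# Note: HDBSCAN labels points it cannot assign to any cluster as -1 (noise);
# these are treated as outliers and excluded from the cluster metrics below.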
result = pd.DataFrame(umap_text_features2D, columns=['x', 'y'])
result['labels'] = cluster.labels_.tolist() # cluster.labels_
clusters = np.int8([str(i) for i in result['labels']])
outliers = result.loc[result.labels == -1, :]
clusters_no_outliers = result.loc[result.labels != -1, :]
unique_clusters = np.unique(result.labels)
print(f"\nfound {len(unique_clusters[unique_clusters>-1])} clusters.\n")
# save umaped vectors:
df = pd.read_pickle(args.df_cases_file)
df['umapX_HDBSCAN'] = umap_text_features2D[:, 0]
df['umapY_HDBSCAN'] = umap_text_features2D[:, 1]
df['label_HDBSCAN'] = clusters
df.to_pickle(args.df_cases_file)
#%% and evaluate the results with several metrics (not needing ground truth)
evaluation = ClusterMetrics(umap_text_features[result.labels >= 0,], clusters_no_outliers.labels.tolist(),
file_name= "TextClustering/cluster_metrics/HDBSCAN_metrics.pkl")
evaluation.write_to_file()
#%% argsparse section
import sys, os
sys.path.append(os.getcwd())
from TextClustering.argsparse_clustering_preamble import argsparse_preamble
args = argsparse_preamble()
from database_preparation.utils_labeled_datasets import is_text_lst_tokenized
if not is_text_lst_tokenized(args.path2corpus):
print("Error: "+args.path2corpus + '.pkl is not tokenized! '
'Please pass texts list where each text is tokenized (a list of words).')
exit(1)
#%% import section
import pickle
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import CoherenceModel
from tqdm import tqdm
from database_preparation.preprocess import print_meta_data
from TextClassification.classification_for_cluster_evaluation import cross_validate_label_corpus_with_simple_SVM
#%% load the diag and main_diag list
with open(args.path2corpus, 'rb') as f:
diag_lst = pickle.load(f)
print_meta_data(args.path2corpus)
#%% prepare the data for LDA model training
# Create the dictionary, which is a mapping of word IDs to words.
words = corpora.Dictionary(diag_lst)
# Turns each document into a bag of words.
corpus = [words.doc2bow(doc) for doc in diag_lst]  # bag-of-words corpus (not yet a trained model)
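# Illustrative (hedged) example: for a toy tokenized document like ['niere', 'niere', 'biopsie'],
# words.doc2bow(...) returns (word-id, count) pairs such as [(id_niere, 2), (id_biopsie, 1)];
# this sparse bag-of-words representation is what the LDA model is trained on.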
#%% train LDA-model with different number of clusters
if args.find_k_value:
limit=21; start=5; step=1
coherence_values = []
model_list, n_cluster, svm_scores = [], [], []
for num_topics in tqdm(range(start, limit, step)):
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
id2word=words,
num_topics=num_topics,
random_state=5,
update_every=1,
passes=10,
alpha='auto',
per_word_topics=True)
coherencemodel = CoherenceModel(model=lda_model, texts=diag_lst, dictionary=words,
coherence='c_v', processes= 1)
coherence_values.append(coherencemodel.get_coherence())
topic_weights = []
for i, row_list in enumerate(lda_model[corpus]):
topic_weights.append([w for i, w in row_list[0]])
predictedCluster = np.argmax(pd.DataFrame(topic_weights).fillna(0).values, axis=1)
svm_scores.append(
cross_validate_label_corpus_with_simple_SVM(predictedCluster, args.path2corpus,
False))
#n_cluster.append(len(lda_model.print_topics(num_words=3)))
n_cluster.append(len(np.unique(np.asarray(predictedCluster))))
print("coherence: " + str(coherencemodel.get_coherence()))
#%% visualize the results
x = range(start, limit, step)
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ax3 = ax1.twinx()
ax1.plot(x, coherence_values,'bx-')
ax2.plot(x, n_cluster, 'rx-')
ax3.plot(x, svm_scores, 'gx-')
ax1.set_xlabel('Number of topics')
ax1.yaxis.label.set_color('blue')
ax1.set_ylabel('Coherence score')
ax2.yaxis.label.set_color('red')
ax2.set_ylabel('Number of clusters')
ax3.yaxis.label.set_color('green')
ax3.set_ylabel('svm accuracy')
plt.title('Elbow-method-like plot')
plt.savefig("TextClustering/plots/elbow_method/LDA_elbow_plot.png", dpi=300)
plt.show()
exit()
#%% train LDA-model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
id2word=words,
num_topics=args.k_value,
random_state=5,
update_every=1,
passes=10,
alpha='auto',
per_word_topics=True)
#%% get topic weights / features
topic_weights = []
for i, row_list in enumerate(lda_model[corpus]):
topic_weights.append([w for i, w in row_list[0]])
# Array of topic weights
text_features = pd.DataFrame(topic_weights).fillna(0).values
#%% get prediction
predictedCluster= np.argmax(text_features, axis=1)
# and add it to the dataframe
df = pd.read_pickle(args.df_cases_file)
df['label_LDA'] = predictedCluster
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
reduced_features = pca.fit_transform(text_features)
df['pcaX_LDA'] = reduced_features[:, 0]
df['pcaY_LDA'] = reduced_features[:, 1]
#%% and with umap
import umap
umap_text_features2D = umap.UMAP(n_neighbors=15,
n_components=2,
min_dist=0.0, metric='cosine').fit_transform(text_features)
df['umapX_LDA'] = umap_text_features2D[:, 0]
df['umapY_LDA'] = umap_text_features2D[:, 1]
df.to_pickle(args.df_cases_file)
#%% evalute the model
from TextClustering.utils_metrics import ClusterMetrics
evaluation = ClusterMetrics(text_features, predictedCluster,
file_name= "TextClustering/cluster_metrics/LDA_metrics.pkl")
evaluation.write_to_file()
# %% import section
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import pandas as pd
from database_preparation.preprocess import print_meta_data
from TextClassification.classification_for_cluster_evaluation import cross_validate_label_corpus_with_simple_SVM
import umap
from database_preparation.utils_stringpreparation import get_most_frequent_words
import numpy as np
from database_preparation.utils_labeled_datasets import is_text_lst_tokenized, is_text_lst_tfidf_vectorized
from TextClustering.argsparse_clustering_preamble import argsparse_preamble
import os
args = argsparse_preamble()
plot_real_diagnosis = False
def identity(word):
return word
# %% load the data
with open(args.path2corpus, 'rb') as f:
diag_lst = pickle.load(f)
text_is_vectorized = is_text_lst_tfidf_vectorized(args.path2corpus)
if not (is_text_lst_tokenized(args.path2corpus) or text_is_vectorized):
print("Error: " + args.path2corpus + ' has to be a list of tokenized texts or tfidf-vectorized texts!')
exit(1)
print_meta_data(args.path2corpus)
def create_vectorizer(data):
vec = TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)
vec = vec.fit_transform(data)
return vec
if text_is_vectorized:
text_features = diag_lst
else:
text_features = create_vectorizer(diag_lst)
# %% perform elbow-method to find good cluster number
if args.find_k_value:
Sum_of_squared_distances, svm_values = [], []
K = range(2, 23, 1)
for k in K:
print("iteration #" + str(k))
km = KMeans(n_clusters=k, max_iter=200, n_init=10)
km = km.fit(text_features)
predictedCluster_text_features = km.predict(text_features)
Sum_of_squared_distances.append(km.inertia_)
svm_values.append(
cross_validate_label_corpus_with_simple_SVM(predictedCluster_text_features, args.path2corpus, False))
fig, ax1 = plt.subplots()
#ax2 = ax1.twinx()
ax3 = ax1.twinx()
ax1.plot(K, Sum_of_squared_distances, 'bx-')
#ax2.plot(K, svm_values, 'rx-')
ax3.plot(K, svm_values, 'gx-')
ax1.set_xlabel('K')
ax1.yaxis.label.set_color('blue')
ax1.set_ylabel('Sum_of_squared_distances')
#ax2.yaxis.label.set_color('red')
#ax2.set_ylabel('Number of clusters')
ax3.yaxis.label.set_color('green')
ax3.set_ylabel('svm accuracy')
plt.title('Elbow-method-like plot')
plt.savefig("TextClustering/plots/elbow_method/KMeans_elbow_plot.png", dpi=300)
plt.show()
exit()
km = KMeans(n_clusters=args.k_value, max_iter=200, n_init=10)
km = km.fit(text_features)
predictedCluster_text_features = km.predict(text_features)
umap_text_features2D = umap.UMAP(n_neighbors=15,
n_components=2,
min_dist=0.0, metric='cosine').fit_transform(text_features)
# save umaped vectors:
df = pd.read_pickle(args.df_cases_file)
df['umapX_KMeans'] = umap_text_features2D[:, 0]
df['umapY_KMeans'] = umap_text_features2D[:, 1]
df['label_KMeans'] = predictedCluster_text_features
df.to_pickle(args.df_cases_file)
clusters = km.labels_.tolist()
docs = {'text': diag_lst, 'cluster': clusters}
# %% generate topic words with GT:
if not text_is_vectorized:
frame = pd.DataFrame(docs, index=[clusters])
clusters = []
word_list_GT = []
n_words = 10
for cluster in range(0, args.k_value):
t_frame = frame[frame['cluster'] == cluster]
all_text = " ".join(t_frame['text'].astype(str))
top_words = get_most_frequent_words(all_text, n_words)
clusters.append(cluster)
word_list_GT.append(top_words)
for i in range(0, len(word_list_GT)):
t_token = np.array(word_list_GT[i])
if len(t_token) < n_words + 1:
t_token = np.append(t_token, np.repeat(np.nan, n_words - len(t_token)))
t_token = t_token.reshape((1, -1))
if i == 0:
token_list = t_token
else:
token_list = np.concatenate((token_list, t_token), axis=0)
pd.DataFrame(token_list).to_excel('TextClustering/tables/WordsPerCluster_kmeans.xlsx',
sheet_name="GT for kmeans")
# %% evalute the model by clustering metrics
from TextClustering.utils_metrics import ClusterMetrics
evaluation = ClusterMetrics(text_features, km.labels_.tolist(),
file_name="TextClustering/cluster_metrics/KMeans_metrics.pkl")
evaluation.write_to_file()
# %% argsparse preamble
import sys, os
sys.path.append(os.getcwd())
from TextClustering.argsparse_clustering_preamble import argsparse_preamble
args = argsparse_preamble()
from database_preparation.utils_labeled_datasets import is_text_lst_tokenized
if is_text_lst_tokenized(args.path2corpus):
print("Error: " + args.path2corpus + ' is tokenized! '
'Please pass texts list where each text is a single string!')
exit(1)
# %% prepare the background
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import umap
import hdbscan
from TextClustering.utils_metrics import ClusterMetrics
from sentence_transformers import SentenceTransformer
from database_preparation.preprocess import print_meta_data
embedding_backup_folder = "database/backup_files/"
if not os.path.isdir(embedding_backup_folder):
os.makedirs(embedding_backup_folder)
path_2_pathoBERT = "./LanguageModelling/ger-patho-bert-2"
with open(args.path2corpus, 'rb') as f:
diag_lst = pickle.load(f)
print_meta_data(args.path2corpus)
if args.do_embedding:
# %% load the model
if args.model2use == "German_BERT":
model = SentenceTransformer("Sahajtomar/German-semantic")
elif args.model2use == "Patho_BERT":
model = SentenceTransformer(path_2_pathoBERT)
# %% and apply the embedding-model to the text (only once, since very time-consuming)
if not 'embeddings' in locals():
embeddings = model.encode(diag_lst, show_progress_bar=True)
np.save(embedding_backup_folder + args.model2use + "_embeddingsBackup.npy", embeddings)
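# 'embeddings' is an (n_documents x embedding_dim) float array; it is cached to disk
# so that the expensive sentence-embedding step only has to run once per model.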
# %% load it (if not there)
if not 'embeddings' in locals():
embeddings = np.load(embedding_backup_folder + args.model2use + "_embeddingsBackup.npy")
# %% perform umap
umap_embeddings = umap.UMAP(n_neighbors=15,
n_components=5,
metric='cosine').fit_transform(embeddings)
# and perform umap-dimension-reduction for visualization
umap_text_features2D = umap.UMAP(n_neighbors=15,
n_components=2,
min_dist=0.0, metric='cosine').fit_transform(embeddings)
if 'umap_embeddings' in locals():
np.save(embedding_backup_folder + args.model2use + "_umap_embeddingsBackup.npy", umap_embeddings)
# %% perform repetitive clustering to find the best min_cluster_size
if not 'umap_embeddings' in locals():
umap_embeddings = np.load(embedding_backup_folder + args.model2use + "_umap_embeddingsBackup.npy")
if args.find_k_value:
cluster_size = range(5, 40, 2)
s_score, n_cluster = [], []
for i_cluster_size in cluster_size:
cluster = hdbscan.HDBSCAN(min_cluster_size=i_cluster_size,
metric='euclidean',
cluster_selection_method='eom').fit(umap_embeddings)
# and evaluate the results with several metrics (not needing ground truth)
from TextClustering.utils_metrics import ClusterMetrics
evaluation = ClusterMetrics(umap_embeddings, cluster.labels_.tolist())
s_score.append(evaluation.s_score)
n_cluster.append(len(np.unique(cluster.labels_.tolist())))
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ax1.plot(cluster_size, s_score, 'bx-')
ax2.plot(cluster_size, n_cluster, 'rx-')
ax1.set_xlabel('Minimal cluster size')
ax1.yaxis.label.set_color('blue')
ax1.set_ylabel('Silhouette Coefficient')
ax2.yaxis.label.set_color('red')
ax2.set_ylabel('Number of clusters')
plt.title('Elbow-method-like plot')
plt.show()
sys.exit()
# %% perform hdbscan-clustering
if not 'umap_embeddings' in locals():
umap_embeddings = np.load(embedding_backup_folder + args.model2use + "_umap_embeddingsBackup.npy")
cluster = hdbscan.HDBSCAN(min_cluster_size=args.k_value,
metric='euclidean',
cluster_selection_method='eom').fit(umap_embeddings)
# and print the results
result = pd.DataFrame(umap_text_features2D, columns=['x', 'y'])
result['labels'] = cluster.labels_.tolist() # cluster.labels_
print("cluster indices: " + str(np.unique(result.labels)))
outliers = result.loc[result.labels == -1, :]
clustered = result.loc[result.labels != -1, :]
print(str(len(outliers.x)) + " outliers")
# save umaped vectors:
df = pd.read_pickle(args.df_cases_file)
df['umapX_' + args.model2use] = result.x
df['umapY_' + args.model2use] = result.y
# %% update df_cases
df['label_' + args.model2use] = result.labels
df.to_pickle(args.df_cases_file)
# %% and evaluate the results with several metrics (not needing ground truth)
evaluation = ClusterMetrics(umap_embeddings[result.labels >= 0,], clustered.labels.tolist(),
file_name="TextClustering/cluster_metrics/" + args.model2use + "_metrics.pkl")
evaluation.write_to_file()
# Author: Dimo Angelov
#
# License: BSD 3 clause
import logging
import numpy as np
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import strip_tags
import umap
import hdbscan
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from joblib import dump, load
from sklearn.cluster import dbscan
import tempfile
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
from scipy.special import softmax
try:
import hnswlib
_HAVE_HNSWLIB = True
except ImportError:
_HAVE_HNSWLIB = False
try:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
_HAVE_TENSORFLOW = True
except ImportError:
_HAVE_TENSORFLOW = False
try:
from sentence_transformers import SentenceTransformer
_HAVE_TORCH = True
except ImportError:
_HAVE_TORCH = False
logger = logging.getLogger('top2vec')
logger.setLevel(logging.WARNING)
sh = logging.StreamHandler()
sh.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
logger.addHandler(sh)
def default_tokenizer(doc):
"""Tokenize documents for training and remove too long/short words"""
return simple_preprocess(strip_tags(doc), deacc=True)
class Top2Vec:
"""
Top2Vec
Creates jointly embedded topic, document and word vectors.
Parameters
----------
embedding_model: string
This will determine which model is used to generate the document and
word embeddings. The valid string options are:
* doc2vec
* universal-sentence-encoder
* universal-sentence-encoder-multilingual
* distiluse-base-multilingual-cased
For large data sets and data sets with very unique vocabulary doc2vec
could produce better results. This will train a doc2vec model from
scratch. This method is language agnostic. However multiple languages
will not be aligned.
Using the universal sentence encoder options will be much faster since
those are pre-trained and efficient models. The universal sentence
encoder options are suggested for smaller data sets. They are also
good options for large data sets that are in English or in languages
covered by the multilingual model. It is also suggested for data sets
that are multilingual.
For more information on universal-sentence-encoder visit:
https://tfhub.dev/google/universal-sentence-encoder/4
For more information on universal-sentence-encoder-multilingual visit:
https://tfhub.dev/google/universal-sentence-encoder-multilingual/3
The distiluse-base-multilingual-cased pre-trained sentence transformer
is suggested for multilingual datasets and languages that are not
covered by the multilingual universal sentence encoder. The
transformer is significantly slower than the universal sentence
encoder options.
For more information on distiluse-base-multilingual-cased visit:
https://www.sbert.net/docs/pretrained_models.html
embedding_model_path: string (Optional)
Pre-trained embedding models will be downloaded automatically by
default. However they can also be uploaded from a file that is in the
location of embedding_model_path.
Warning: the model at embedding_model_path must match the
embedding_model parameter type.
documents: List of str
Input corpus, should be a list of strings.
min_count: int (Optional, default 50)
Ignores all words with total frequency lower than this. For smaller
corpora a smaller min_count will be necessary.
speed: string (Optional, default 'learn')
This parameter is only used when using doc2vec as embedding_model.
It will determine how fast the model takes to train. The
fast-learn option is the fastest and will generate the lowest quality
vectors. The learn option will learn better quality vectors but take
a longer time to train. The deep-learn option will learn the best
quality vectors but will take significant time to train. The valid
string speed options are:
* fast-learn
* learn
* deep-learn
use_corpus_file: bool (Optional, default False)
This parameter is only used when using doc2vec as embedding_model.
Setting use_corpus_file to True can sometimes provide speedup for
large datasets when multiple worker threads are available. Documents
are still passed to the model as a list of str, the model will create
a temporary corpus file for training.
document_ids: List of str, int (Optional)
A unique value per document that will be used for referring to
documents in search results. If ids are not given to the model, the
index of each document in the original corpus will become the id.
keep_documents: bool (Optional, default True)
If set to False documents will only be used for training and not saved
as part of the model. This will reduce model size. When using search
functions only document ids will be returned, not the actual
documents.
workers: int (Optional)
The amount of worker threads to be used in training the model. Larger
amount will lead to faster training.
tokenizer: callable (Optional, default None)
Override the default tokenization method. If None then
gensim.utils.simple_preprocess will be used.
use_embedding_model_tokenizer: bool (Optional, default False)
If using an embedding model other than doc2vec, use the model's
tokenizer for document embedding. If set to True the tokenizer, either
default or passed callable will be used to tokenize the text to
extract the vocabulary for word embedding.
umap_args: dict (Optional, default None)
Pass custom arguments to UMAP.
hdbscan_args: dict (Optional, default None)
Pass custom arguments to HDBSCAN.
verbose: bool (Optional, default True)
Whether to print status data during training.
"""
def __init__(self,
documents,
min_count=50,
embedding_model='doc2vec',
embedding_model_path=None,
speed='learn',
use_corpus_file=False,
document_ids=None,
keep_documents=True,
workers=None,
tokenizer=None,
use_embedding_model_tokenizer=False,
umap_args=None,
hdbscan_args=None,
verbose=True
):
if verbose:
logger.setLevel(logging.DEBUG)
self.verbose = True
else:
logger.setLevel(logging.WARNING)
self.verbose = False
if tokenizer is None:
tokenizer = default_tokenizer
# validate documents
if not (isinstance(documents, list) or isinstance(documents, np.ndarray)):
raise ValueError("Documents need to be a list of strings")
if not all((isinstance(doc, str) or isinstance(doc, np.str_)) for doc in documents):
raise ValueError("Documents need to be a list of strings")
if keep_documents:
self.documents = np.array(documents, dtype="object")
else:
self.documents = None
# validate document ids
if document_ids is not None:
if not (isinstance(document_ids, list) or isinstance(document_ids, np.ndarray)):
raise ValueError("Documents ids need to be a list of str or int")
if len(documents) != len(document_ids):
raise ValueError("Document ids need to match number of documents")
elif len(document_ids) != len(set(document_ids)):
raise ValueError("Document ids need to be unique")
if all((isinstance(doc_id, str) or isinstance(doc_id, np.str_)) for doc_id in document_ids):
self.doc_id_type = np.str_
elif all((isinstance(doc_id, int) or isinstance(doc_id, np.int_)) for doc_id in document_ids):
self.doc_id_type = np.int_
else:
raise ValueError("Document ids need to be str or int")
self.document_ids_provided = True
self.document_ids = np.array(document_ids)
self.doc_id2index = dict(zip(document_ids, list(range(0, len(document_ids)))))
else:
self.document_ids_provided = False
self.document_ids = np.array(range(0, len(documents)))
self.doc_id2index = dict(zip(self.document_ids, list(range(0, len(self.document_ids)))))
self.doc_id_type = np.int_
acceptable_embedding_models = ["universal-sentence-encoder-multilingual",
"universal-sentence-encoder",
"distiluse-base-multilingual-cased"]
self.embedding_model_path = embedding_model_path
if embedding_model == 'doc2vec':
# validate training inputs
if speed == "fast-learn":
hs = 0
negative = 5
epochs = 40
elif speed == "learn":
hs = 1
negative = 0
epochs = 40
elif speed == "deep-learn":
hs = 1
negative = 0
epochs = 400
elif speed == "test-learn":
hs = 0
negative = 5
epochs = 1
else:
raise ValueError("speed parameter needs to be one of: fast-learn, learn or deep-learn")
if workers is None:
pass
elif isinstance(workers, int):
pass
else:
raise ValueError("workers needs to be an int")
doc2vec_args = {"vector_size": 300,
"min_count": min_count,
"window": 15,
"sample": 1e-5,
"negative": negative,
"hs": hs,
"epochs": epochs,
"dm": 0,
"dbow_words": 1}
if workers is not None:
doc2vec_args["workers"] = workers
logger.info('Pre-processing documents for training')
if use_corpus_file:
processed = [' '.join(tokenizer(doc)) for doc in documents]
lines = "\n".join(processed)
temp = tempfile.NamedTemporaryFile(mode='w+t')
temp.write(lines)
doc2vec_args["corpus_file"] = temp.name
else:
train_corpus = [TaggedDocument(tokenizer(doc), [i]) for i, doc in enumerate(documents)]
doc2vec_args["documents"] = train_corpus
logger.info('Creating joint document/word embedding')
self.embedding_model = 'doc2vec'
self.model = Doc2Vec(**doc2vec_args)
if use_corpus_file:
temp.close()
elif embedding_model in acceptable_embedding_models:
self.embed = None
self.embedding_model = embedding_model
self._check_import_status()
logger.info('Pre-processing documents for training')
# preprocess documents
tokenized_corpus = [tokenizer(doc) for doc in documents]
def return_doc(doc):
return doc
# preprocess vocabulary
vectorizer = CountVectorizer(tokenizer=return_doc, preprocessor=return_doc)
doc_word_counts = vectorizer.fit_transform(tokenized_corpus)
words = vectorizer.get_feature_names()
word_counts = np.array(np.sum(doc_word_counts, axis=0).tolist()[0])
vocab_inds = np.where(word_counts > min_count)[0]
if len(vocab_inds) == 0:
raise ValueError(f"A min_count of {min_count} results in "
f"all words being ignored, choose a lower value.")
self.vocab = [words[ind] for ind in vocab_inds]
self._check_model_status()
logger.info('Creating joint document/word embedding')
# embed words
self.word_indexes = dict(zip(self.vocab, range(len(self.vocab))))
self.word_vectors = self._l2_normalize(np.array(self.embed(self.vocab)))
# embed documents
if use_embedding_model_tokenizer:
self.document_vectors = self._embed_documents(documents)
else:
train_corpus = [' '.join(tokens) for tokens in tokenized_corpus]
self.document_vectors = self._embed_documents(train_corpus)
else:
raise ValueError(f"{embedding_model} is an invalid embedding model.")
# create 5D embeddings of documents
logger.info('Creating lower dimension embedding of documents')
if umap_args is None:
umap_args = {'n_neighbors': 15,
'n_components': 5,
'metric': 'cosine'}
self.umap_model = umap.UMAP(**umap_args).fit(self._get_document_vectors(norm=False))
# find dense areas of document vectors
logger.info('Finding dense areas of documents')
if hdbscan_args is None:
hdbscan_args = {'min_cluster_size': 15,
'metric': 'euclidean',
'cluster_selection_method': 'eom'}
cluster = hdbscan.HDBSCAN(**hdbscan_args).fit(self.umap_model.embedding_)
# calculate 2D coordinates for plotting / inspection
self.umap_model_2D = umap.UMAP(n_neighbors=15,
n_components=2,
min_dist=0.0, metric='cosine').fit_transform(self._get_document_vectors(norm=False))
self.result = pd.DataFrame(self.umap_model_2D, columns=['x', 'y'])
self.result['labels'] = cluster.labels_.tolist()
self.outliers = self.result.loc[self.result.labels == -1, :]
self.clustered = self.result.loc[self.result.labels != -1, :]
# calculate topic vectors from dense areas of documents
logger.info('Finding topics')
# create topic vectors
self._create_topic_vectors(cluster.labels_)
# deduplicate topics
self._deduplicate_topics()
# find topic words and scores
self.topic_words, self.topic_word_scores = self._find_topic_words_and_scores(topic_vectors=self.topic_vectors)
# assign documents to topic
self.doc_top, self.doc_dist = self._calculate_documents_topic(self.topic_vectors,
self._get_document_vectors())
# calculate topic sizes
self.topic_sizes = self._calculate_topic_sizes(hierarchy=False)
# re-order topics
self._reorder_topics(hierarchy=False)
# initialize variables for hierarchical topic reduction
self.topic_vectors_reduced = None
self.doc_top_reduced = None
self.doc_dist_reduced = None
self.topic_sizes_reduced = None
self.topic_words_reduced = None
self.topic_word_scores_reduced = None
self.hierarchy = None
# initialize document indexing variables
self.document_index = None
self.serialized_document_index = None
self.documents_indexed = False
self.index_id2doc_id = None
self.doc_id2index_id = None
# initialize word indexing variables
self.word_index = None
self.serialized_word_index = None
self.words_indexed = False
def save(self, file):
"""
Saves the current model to the specified file.
Parameters
----------
file: str
File where model will be saved.
"""
document_index_temp = None
word_index_temp = None
# do not save sentence encoders and sentence transformers
if self.embedding_model != "doc2vec":
self.embed = None
# serialize document index so that it can be saved
if self.documents_indexed:
temp = tempfile.NamedTemporaryFile(mode='w+b')
self.document_index.save_index(temp.name)
self.serialized_document_index = temp.read()
temp.close()
document_index_temp = self.document_index
self.document_index = None
# serialize word index so that it can be saved
if self.words_indexed:
temp = tempfile.NamedTemporaryFile(mode='w+b')
self.word_index.save_index(temp.name)
self.serialized_word_index = temp.read()
temp.close()
word_index_temp = self.word_index
self.word_index = None
dump(self, file)
self.document_index = document_index_temp
self.word_index = word_index_temp
@classmethod
def load(cls, file):
"""
Load a pre-trained model from the specified file.
Parameters
----------
file: str
File where model will be loaded from.
"""
top2vec_model = load(file)
# load document index
if top2vec_model.documents_indexed:
if not _HAVE_HNSWLIB:
raise ImportError(f"Cannot load document index.\n\n"
"Try: pip install top2vec[indexing]\n\n"
"Alternatively try: pip install hnswlib")
temp = tempfile.NamedTemporaryFile(mode='w+b')
temp.write(top2vec_model.serialized_document_index)
if top2vec_model.embedding_model == 'doc2vec':
document_vectors = top2vec_model.model.dv.vectors
else:
document_vectors = top2vec_model.document_vectors
top2vec_model.document_index = hnswlib.Index(space='ip',
dim=document_vectors.shape[1])
top2vec_model.document_index.load_index(temp.name, max_elements=document_vectors.shape[0])
temp.close()
top2vec_model.serialized_document_index = None
# load word index
if top2vec_model.words_indexed:
if not _HAVE_HNSWLIB:
raise ImportError(f"Cannot load word index.\n\n"
"Try: pip install top2vec[indexing]\n\n"
"Alternatively try: pip install hnswlib")
temp = tempfile.NamedTemporaryFile(mode='w+b')
temp.write(top2vec_model.serialized_word_index)
if top2vec_model.embedding_model == 'doc2vec':
word_vectors = top2vec_model.model.wv.vectors
else:
word_vectors = top2vec_model.word_vectors
top2vec_model.word_index = hnswlib.Index(space='ip',
dim=word_vectors.shape[1])
top2vec_model.word_index.load_index(temp.name, max_elements=word_vectors.shape[0])
temp.close()
top2vec_model.serialized_word_index = None
return top2vec_model
@staticmethod
def _l2_normalize(vectors):
if vectors.ndim == 2:
return normalize(vectors)
else:
return normalize(vectors.reshape(1, -1))[0]
def _embed_documents(self, train_corpus):
self._check_import_status()
self._check_model_status()
# embed documents
batch_size = 500
document_vectors = []
current = 0
batches = int(len(train_corpus) / batch_size)
extra = len(train_corpus) % batch_size
for ind in range(0, batches):
document_vectors.append(self.embed(train_corpus[current:current + batch_size]))
current += batch_size
if extra > 0:
document_vectors.append(self.embed(train_corpus[current:current + extra]))
document_vectors = self._l2_normalize(np.array(np.vstack(document_vectors)))
return document_vectors
def _embed_query(self, query):
self._check_import_status()
self._check_model_status()
return self._l2_normalize(np.array(self.embed([query])[0]))
def _set_document_vectors(self, document_vectors):
if self.embedding_model == 'doc2vec':
self.model.dv.vectors = document_vectors
else:
self.document_vectors = document_vectors
def _get_document_vectors(self, norm=True):
if self.embedding_model == 'doc2vec':
if norm:
self.model.dv.init_sims()
return self.model.dv.get_normed_vectors()
else:
return self.model.dv.vectors
else:
return self.document_vectors
def _index2word(self, index):
if self.embedding_model == 'doc2vec':
return self.model.wv.index_to_key[index]
else:
return self.vocab[index]
def _get_word_vectors(self):
if self.embedding_model == 'doc2vec':
self.model.wv.init_sims()
return self.model.wv.get_normed_vectors()
else:
return self.word_vectors
def _create_topic_vectors(self, cluster_labels):
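# Each topic vector is the L2-normalized centroid of the document vectors in one dense
# cluster; the HDBSCAN noise label (-1) is dropped, so outlier documents do not form a topic.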
unique_labels = set(cluster_labels)
if -1 in unique_labels:
unique_labels.remove(-1)
self.topic_vectors = self._l2_normalize(
np.vstack([self._get_document_vectors(norm=False)[np.where(cluster_labels == label)[0]]
.mean(axis=0) for label in unique_labels]))
def _deduplicate_topics(self):
core_samples, labels = dbscan(X=self.topic_vectors,
eps=0.1,
min_samples=2,
metric="cosine")
duplicate_clusters = set(labels)
if len(duplicate_clusters) > 1 or -1 not in duplicate_clusters:
# unique topics
unique_topics = self.topic_vectors[np.where(labels == -1)[0]]
if -1 in duplicate_clusters:
duplicate_clusters.remove(-1)
# merge duplicate topics
for unique_label in duplicate_clusters:
unique_topics = np.vstack(
[unique_topics, self._l2_normalize(self.topic_vectors[np.where(labels == unique_label)[0]]
.mean(axis=0))])
self.topic_vectors = unique_topics
def _calculate_topic_sizes(self, hierarchy=False):
if hierarchy:
topic_sizes = pd.Series(self.doc_top_reduced).value_counts()
else:
topic_sizes = pd.Series(self.doc_top).value_counts()
return topic_sizes
def _reorder_topics(self, hierarchy=False):
if hierarchy:
self.topic_vectors_reduced = self.topic_vectors_reduced[self.topic_sizes_reduced.index]
self.topic_words_reduced = self.topic_words_reduced[self.topic_sizes_reduced.index]
self.topic_word_scores_reduced = self.topic_word_scores_reduced[self.topic_sizes_reduced.index]
old2new = dict(zip(self.topic_sizes_reduced.index, range(self.topic_sizes_reduced.index.shape[0])))
self.doc_top_reduced = np.array([old2new[i] for i in self.doc_top_reduced])
self.hierarchy = [self.hierarchy[i] for i in self.topic_sizes_reduced.index]
self.topic_sizes_reduced.reset_index(drop=True, inplace=True)
else:
self.topic_vectors = self.topic_vectors[self.topic_sizes.index]
self.topic_words = self.topic_words[self.topic_sizes.index]
self.topic_word_scores = self.topic_word_scores[self.topic_sizes.index]
old2new = dict(zip(self.topic_sizes.index, range(self.topic_sizes.index.shape[0])))
self.doc_top = np.array([old2new[i] for i in self.doc_top])
self.topic_sizes.reset_index(drop=True, inplace=True)
@staticmethod
def _calculate_documents_topic(topic_vectors, document_vectors, dist=True, num_topics=None):
batch_size = 10000
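# Each document is assigned to the topic with the largest inner product; because both vector
# sets are L2-normalized this equals cosine similarity. The similarity matrix is computed in
# batches of 10,000 documents to keep memory bounded.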
doc_top = []
if dist:
doc_dist = []
if document_vectors.shape[0] > batch_size:
current = 0
batches = int(document_vectors.shape[0] / batch_size)
extra = document_vectors.shape[0] % batch_size
for ind in range(0, batches):
res = np.inner(document_vectors[current:current + batch_size], topic_vectors)
if num_topics is None:
doc_top.extend(np.argmax(res, axis=1))
if dist:
doc_dist.extend(np.max(res, axis=1))
else:
doc_top.extend(np.flip(np.argsort(res), axis=1)[:, :num_topics])
if dist:
doc_dist.extend(np.flip(np.sort(res), axis=1)[:, :num_topics])
current += batch_size
if extra > 0:
res = np.inner(document_vectors[current:current + extra], topic_vectors)
if num_topics is None:
doc_top.extend(np.argmax(res, axis=1))
if dist:
doc_dist.extend(np.max(res, axis=1))
else:
doc_top.extend(np.flip(np.argsort(res), axis=1)[:, :num_topics])
if dist:
doc_dist.extend(np.flip(np.sort(res), axis=1)[:, :num_topics])
if dist:
doc_dist = np.array(doc_dist)
else:
res = np.inner(document_vectors, topic_vectors)
if num_topics is None:
doc_top = np.argmax(res, axis=1)
if dist:
doc_dist = np.max(res, axis=1)
else:
doc_top.extend(np.flip(np.argsort(res), axis=1)[:, :num_topics])
if dist:
doc_dist.extend(np.flip(np.sort(res), axis=1)[:, :num_topics])
if num_topics is not None:
doc_top = np.array(doc_top)
if dist:
doc_dist = np.array(doc_dist)
if dist:
return doc_top, doc_dist
else:
return doc_top
def _find_topic_words_and_scores(self, topic_vectors):
topic_words = []
topic_word_scores = []
res = np.inner(topic_vectors, self._get_word_vectors())
top_words = np.flip(np.argsort(res, axis=1), axis=1)
top_scores = np.flip(np.sort(res, axis=1), axis=1)
for words, scores in zip(top_words, top_scores):
topic_words.append([self._index2word(i) for i in words[0:50]])
topic_word_scores.append(scores[0:50])
topic_words = np.array(topic_words)
topic_word_scores = np.array(topic_word_scores)
return topic_words, topic_word_scores
def _assign_documents_to_topic(self, document_vectors, hierarchy=False):
if hierarchy:
doc_top_new, doc_dist_new = self._calculate_documents_topic(self.topic_vectors_reduced,
document_vectors,
dist=True)
self.doc_top_reduced = np.append(self.doc_top_reduced, doc_top_new)
self.doc_dist_reduced = np.append(self.doc_dist_reduced, doc_dist_new)
topic_sizes_new = pd.Series(doc_top_new).value_counts()
for top in topic_sizes_new.index.tolist():
self.topic_sizes_reduced[top] += topic_sizes_new[top]
self.topic_sizes_reduced.sort_values(ascending=False, inplace=True)
self._reorder_topics(hierarchy)
else:
doc_top_new, doc_dist_new = self._calculate_documents_topic(self.topic_vectors, document_vectors, dist=True)
self.doc_top = np.append(self.doc_top, doc_top_new)
self.doc_dist = np.append(self.doc_dist, doc_dist_new)
topic_sizes_new = pd.Series(doc_top_new).value_counts()
for top in topic_sizes_new.index.tolist():
self.topic_sizes[top] += topic_sizes_new[top]
self.topic_sizes.sort_values(ascending=False, inplace=True)
self._reorder_topics(hierarchy)
def _unassign_documents_from_topic(self, doc_indexes, hierarchy=False):
if hierarchy:
doc_top_remove = self.doc_top_reduced[doc_indexes]
self.doc_top_reduced = np.delete(self.doc_top_reduced, doc_indexes, 0)
self.doc_dist_reduced = np.delete(self.doc_dist_reduced, doc_indexes, 0)
topic_sizes_remove = pd.Series(doc_top_remove).value_counts()
for top in topic_sizes_remove.index.tolist():
self.topic_sizes_reduced[top] -= topic_sizes_remove[top]
self.topic_sizes_reduced.sort_values(ascending=False, inplace=True)
self._reorder_topics(hierarchy)
else:
doc_top_remove = self.doc_top[doc_indexes]
self.doc_top = np.delete(self.doc_top, doc_indexes, 0)
self.doc_dist = np.delete(self.doc_dist, doc_indexes, 0)
topic_sizes_remove = pd.Series(doc_top_remove).value_counts()
for top in topic_sizes_remove.index.tolist():
self.topic_sizes[top] -= topic_sizes_remove[top]
self.topic_sizes.sort_values(ascending=False, inplace=True)
self._reorder_topics(hierarchy)
def _get_document_ids(self, doc_index):
return self.document_ids[doc_index]
def _get_document_indexes(self, doc_ids):
if self.document_ids is None:
return doc_ids
else:
return [self.doc_id2index[doc_id] for doc_id in doc_ids]
def _words2word_vectors(self, keywords):
return self._get_word_vectors()[[self._word2index(word) for word in keywords]]
def _word2index(self, word):
if self.embedding_model == 'doc2vec':
return self.model.wv.vocab[word].index
else:
return self.word_indexes[word]
def _get_combined_vec(self, vecs, vecs_neg):
combined_vector = np.zeros(self._get_document_vectors().shape[1], dtype=np.float64)
for vec in vecs:
combined_vector += vec
for vec in vecs_neg:
combined_vector -= vec
combined_vector /= (len(vecs) + len(vecs_neg))
combined_vector = self._l2_normalize(combined_vector)
return combined_vector
@staticmethod
def _search_vectors_by_vector(vectors, vector, num_res):
ranks = np.inner(vectors, vector)
indexes = np.flip(np.argsort(ranks)[-num_res:])
scores = np.array([ranks[res] for res in indexes])
return indexes, scores
@staticmethod
def _check_hnswlib_status():
if not _HAVE_HNSWLIB:
raise ImportError(f"Indexing is not available.\n\n"
"Try: pip install top2vec[indexing]\n\n"
"Alternatively try: pip install hnswlib")
def _check_document_index_status(self):
if self.document_index is None:
raise ImportError("There is no document index.\n\n"
"Call index_document_vectors method before setting use_index=True.")
def _check_word_index_status(self):
if self.word_index is None:
raise ImportError("There is no word index.\n\n"
"Call index_word_vectors method before setting use_index=True.")
def _check_import_status(self):
if self.embedding_model != 'distiluse-base-multilingual-cased':
if not _HAVE_TENSORFLOW:
raise ImportError(f"{self.embedding_model} is not available.\n\n"
"Try: pip install top2vec[sentence_encoders]\n\n"
"Alternatively try: pip install tensorflow tensorflow_hub tensorflow_text")
else:
if not _HAVE_TORCH:
raise ImportError(f"{self.embedding_model} is not available.\n\n"
"Try: pip install top2vec[sentence_transformers]\n\n"
"Alternatively try: pip install torch sentence_transformers")
def _check_model_status(self):
if self.embed is None:
if self.verbose is False:
logger.setLevel(logging.DEBUG)
if self.embedding_model != "distiluse-base-multilingual-cased":
if self.embedding_model_path is None:
logger.info(f'Downloading {self.embedding_model} model')
if self.embedding_model == "universal-sentence-encoder-multilingual":
module = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
else:
module = "https://tfhub.dev/google/universal-sentence-encoder/4"
else:
logger.info(f'Loading {self.embedding_model} model at {self.embedding_model_path}')
module = self.embedding_model_path
self.embed = hub.load(module)
else:
if self.embedding_model_path is None:
logger.info(f'Downloading {self.embedding_model} model')
module = 'distiluse-base-multilingual-cased'
else:
logger.info(f'Loading {self.embedding_model} model at {self.embedding_model_path}')
module = self.embedding_model_path
model = SentenceTransformer(module)
self.embed = model.encode
if self.verbose is False:
logger.setLevel(logging.WARNING)
@staticmethod
def _less_than_zero(num, var_name):
if num < 0:
raise ValueError(f"{var_name} cannot be less than 0.")
def _validate_hierarchical_reduction(self):
if self.hierarchy is None:
raise ValueError("Hierarchical topic reduction has not been performed.")
def _validate_hierarchical_reduction_num_topics(self, num_topics):
current_num_topics = len(self.topic_vectors)
if num_topics >= current_num_topics:
raise ValueError(f"Number of topics must be less than {current_num_topics}.")
def _validate_num_docs(self, num_docs):
self._less_than_zero(num_docs, "num_docs")
document_count = len(self.doc_top)
if num_docs > document_count:
raise ValueError(f"num_docs cannot exceed the number of documents: {document_count}.")
def _validate_num_topics(self, num_topics, reduced):
self._less_than_zero(num_topics, "num_topics")
if reduced:
topic_count = len(self.topic_vectors_reduced)
if num_topics > topic_count:
raise ValueError(f"num_topics cannot exceed the number of reduced topics: {topic_count}.")
else:
topic_count = len(self.topic_vectors)
if num_topics > topic_count:
raise ValueError(f"num_topics cannot exceed the number of topics: {topic_count}.")
def _validate_topic_num(self, topic_num, reduced):
self._less_than_zero(topic_num, "topic_num")
if reduced:
topic_count = len(self.topic_vectors_reduced) - 1
if topic_num > topic_count:
raise ValueError(f"Invalid topic number: valid reduced topics numbers are 0 to {topic_count}.")
else:
topic_count = len(self.topic_vectors) - 1
if topic_num > topic_count:
raise ValueError(f"Invalid topic number: valid original topics numbers are 0 to {topic_count}.")
def _validate_topic_search(self, topic_num, num_docs, reduced):
self._less_than_zero(num_docs, "num_docs")
if reduced:
if num_docs > self.topic_sizes_reduced[topic_num]:
raise ValueError(f"Invalid number of documents: reduced topic {topic_num}"
f" only has {self.topic_sizes_reduced[topic_num]} documents.")
else:
if num_docs > self.topic_sizes[topic_num]:
raise ValueError(f"Invalid number of documents: original topic {topic_num}"
f" only has {self.topic_sizes[topic_num]} documents.")
def _validate_doc_ids(self, doc_ids, doc_ids_neg):
if not (isinstance(doc_ids, list) or isinstance(doc_ids, np.ndarray)):
raise ValueError("doc_ids must be a list of string or int.")
if not (isinstance(doc_ids_neg, list) or isinstance(doc_ids_neg, np.ndarray)):
raise ValueError("doc_ids_neg must be a list of string or int.")
if isinstance(doc_ids, np.ndarray):
doc_ids = list(doc_ids)
if isinstance(doc_ids_neg, np.ndarray):
doc_ids_neg = list(doc_ids_neg)
doc_ids_all = doc_ids + doc_ids_neg
if self.document_ids is not None:
for doc_id in doc_ids_all:
if doc_id not in self.doc_id2index:
raise ValueError(f"{doc_id} is not a valid document id.")
elif min(doc_ids) < 0:
raise ValueError(f"{min(doc_ids)} is not a valid document id.")
elif max(doc_ids) > len(self.doc_top) - 1:
raise ValueError(f"{max(doc_ids)} is not a valid document id.")
def _validate_keywords(self, keywords, keywords_neg):
if not (isinstance(keywords, list) or isinstance(keywords, np.ndarray)):
raise ValueError("keywords must be a list of strings.")
if not (isinstance(keywords_neg, list) or isinstance(keywords_neg, np.ndarray)):
raise ValueError("keywords_neg must be a list of strings.")
keywords_lower = [keyword.lower() for keyword in keywords]
keywords_neg_lower = [keyword.lower() for keyword in keywords_neg]
if self.embedding_model == 'doc2vec':
vocab = self.model.wv.vocab
else:
vocab = self.vocab
for word in keywords_lower + keywords_neg_lower:
if word not in vocab:
raise ValueError(f"'{word}' has not been learned by the model so it cannot be searched.")
return keywords_lower, keywords_neg_lower
def _validate_document_ids_add_doc(self, documents, document_ids):
if document_ids is None:
raise ValueError("Document ids need to be provided.")
if len(documents) != len(document_ids):
raise ValueError("Document ids need to match number of documents.")
if len(document_ids) != len(set(document_ids)):
raise ValueError("Document ids need to be unique.")
if len(set(document_ids).intersection(self.document_ids)) > 0:
raise ValueError("Some document ids already exist in model.")
if self.doc_id_type == np.str_:
if not all((isinstance(doc_id, str) or isinstance(doc_id, np.str_)) for doc_id in document_ids):
raise ValueError("Document ids need to be of type str.")
if self.doc_id_type == np.int_:
if not all((isinstance(doc_id, int) or isinstance(doc_id, np.int_)) for doc_id in document_ids):
raise ValueError("Document ids need to be of type int.")
@staticmethod
def _validate_documents(documents):
if not all((isinstance(doc, str) or isinstance(doc, np.str_)) for doc in documents):
raise ValueError("Documents need to be a list of strings.")
@staticmethod
def _validate_query(query):
if not isinstance(query, (str, np.str_)):
raise ValueError("Query needs to be a string.")
def _validate_vector(self, vector):
if not isinstance(vector, np.ndarray):
raise ValueError("Vector needs to be a numpy array.")
vec_size = self._get_document_vectors().shape[1]
if not vector.shape[0] == vec_size:
raise ValueError(f"Vector needs to be of {vec_size} dimensions.")
def index_document_vectors(self, ef_construction=200, M=64):
"""
Creates an index of the document vectors using hnswlib. This will
lead to faster search times for models with a large number of
documents.
For more information on hnswlib see: https://github.com/nmslib/hnswlib
Parameters
----------
ef_construction: int (Optional default 200)
This parameter controls the trade-off between index construction
time and index accuracy. Larger values will lead to greater
accuracy but will take longer to construct.
M: int (Optional default 64)
This parameter controls the trade-off between both index size as
well as construction time and accuracy. Larger values will lead to
greater accuracy but will result in a larger index as well as
longer construction time.
For more information on the parameters see:
https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
"""
self._check_hnswlib_status()
document_vectors = self._get_document_vectors()
vec_dim = document_vectors.shape[1]
num_vecs = document_vectors.shape[0]
index_ids = list(range(0, len(self.document_ids)))
self.index_id2doc_id = dict(zip(index_ids, self.document_ids))
self.doc_id2index_id = dict(zip(self.document_ids, index_ids))
self.document_index = hnswlib.Index(space='ip', dim=vec_dim)
self.document_index.init_index(max_elements=num_vecs, ef_construction=ef_construction, M=M)
self.document_index.add_items(document_vectors, index_ids)
self.documents_indexed = True
def index_word_vectors(self, ef_construction=200, M=64):
"""
Creates an index of the word vectors using hnswlib. This will
lead to faster search times for models with a large number of
words.
For more information on hnswlib see: https://github.com/nmslib/hnswlib
Parameters
----------
ef_construction: int (Optional default 200)
This parameter controls the trade-off between index construction
time and index accuracy. Larger values will lead to greater
accuracy but will take longer to construct.
M: int (Optional default 64)
This parameter controls the trade-off between both index size as
well as construction time and accuracy. Larger values will lead to
greater accuracy but will result in a larger index as well as
longer construction time.
For more information on the parameters see:
https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
"""
self._check_hnswlib_status()
word_vectors = self._get_word_vectors()
vec_dim = word_vectors.shape[1]
num_vecs = word_vectors.shape[0]
index_ids = list(range(0, num_vecs))
self.word_index = hnswlib.Index(space='ip', dim=vec_dim)
self.word_index.init_index(max_elements=num_vecs, ef_construction=ef_construction, M=M)
self.word_index.add_items(word_vectors, index_ids)
self.words_indexed = True
def update_embedding_model_path(self, embedding_model_path):
"""
Update the path of the embedding model to be loaded. The model will
no longer be downloaded but loaded from the path location.
Warning: the model at embedding_model_path must match the
embedding_model parameter type.
Parameters
----------
embedding_model_path: Str
Path to downloaded embedding model.
"""
self.embedding_model_path = embedding_model_path
def change_to_download_embedding_model(self):
"""
Use automatic download to load the embedding model used for training.
Top2Vec will no longer try to load the embedding model from a file
if an embedding_model path was previously added.
"""
self.embedding_model_path = None
def get_documents_topics(self, doc_ids, reduced=False, num_topics=1):
"""
Get document topics.
The topic of each document will be returned.
The corresponding original topics are returned unless reduced=True,
in which case the reduced topics will be returned.
Parameters
----------
doc_ids: List of str, int
A unique value per document that is used for referring to
documents in search results. If ids were not given to the model,
the index of each document in the model is the id.
reduced: bool (Optional, default False)
Original topics are returned by default. If True the
reduced topics will be returned.
num_topics: int (Optional, default 1)
The number of topics to return per document.
Returns
-------
topic_nums: array of int, shape(len(doc_ids), num_topics)
The topic number(s) of the document corresponding to each doc_id.
topic_score: array of float, shape(len(doc_ids), num_topics)
Semantic similarity of document to topic(s). The cosine similarity
of the document and topic vector.
topics_words: array of shape(len(doc_ids), num_topics, 50)
For each topic the top 50 words are returned, in order
of semantic similarity to topic.
Example:
[['data', 'deep', 'learning' ... 'artificial'], <Topic 4>
['environment', 'warming', 'climate' ... 'temperature'] <Topic 21>
...]
word_scores: array of shape(num_topics, 50)
For each topic the cosine similarity scores of the
top 50 words to the topic are returned.
Example:
[[0.7132, 0.6473, 0.5700 ... 0.3455], <Topic 4>
[0.7818, 0.7671, 0.7603 ... 0.6769] <Topic 21>
...]
"""
if reduced:
self._validate_hierarchical_reduction()
# make sure documents exist
self._validate_doc_ids(doc_ids, doc_ids_neg=[])
# get document indexes from ids
doc_indexes = self._get_document_indexes(doc_ids)
if num_topics == 1:
if reduced:
doc_topics = self.doc_top_reduced[doc_indexes]
doc_dist = self.doc_dist_reduced[doc_indexes]
topic_words = self.topic_words_reduced[doc_topics]
topic_word_scores = self.topic_word_scores_reduced[doc_topics]
else:
doc_topics = self.doc_top[doc_indexes]
doc_dist = self.doc_dist[doc_indexes]
topic_words = self.topic_words[doc_topics]
topic_word_scores = self.topic_word_scores[doc_topics]
else:
if reduced:
topic_vectors = self.topic_vectors_reduced
else:
topic_vectors = self.topic_vectors
doc_topics, doc_dist = self._calculate_documents_topic(topic_vectors,
self._get_document_vectors()[doc_indexes],
num_topics=num_topics)
topic_words = np.array([self.topic_words[topics] for topics in doc_topics])
topic_word_scores = np.array([self.topic_word_scores[topics] for topics in doc_topics])
return doc_topics, doc_dist, topic_words, topic_word_scores
def add_documents(self, documents, doc_ids=None, tokenizer=None, use_embedding_model_tokenizer=False):
"""
Update the model with new documents.
The documents will be added to the current model without changing
existing document, word and topic vectors. Topic sizes will be updated.
If adding a large quantity of documents relative to the current model
size, or documents containing a largely new vocabulary, a new model
should be trained for best results.
Parameters
----------
documents: List of str
doc_ids: List of str, int (Optional)
Only required when doc_ids were given to the original model.
A unique value per document that will be used for referring to
documents in search results.
tokenizer: callable (Optional, default None)
Override the default tokenization method. If None then
gensim.utils.simple_preprocess will be used.
use_embedding_model_tokenizer: bool (Optional, default False)
If using an embedding model other than doc2vec, use the model's
tokenizer for document embedding.
"""
# if tokenizer is not passed use default
if tokenizer is None:
tokenizer = default_tokenizer
# add documents
self._validate_documents(documents)
if self.documents is not None:
self.documents = np.append(self.documents, documents)
# add document ids
if self.document_ids_provided is True:
self._validate_document_ids_add_doc(documents, doc_ids)
doc_ids_len = len(self.document_ids)
self.document_ids = np.append(self.document_ids, doc_ids)
self.doc_id2index.update(dict(zip(doc_ids, list(range(doc_ids_len, doc_ids_len + len(doc_ids))))))
elif doc_ids is None:
num_docs = len(documents)
start_id = max(self.document_ids) + 1
doc_ids = list(range(start_id, start_id + num_docs))
doc_ids_len = len(self.document_ids)
self.document_ids = np.append(self.document_ids, doc_ids)
self.doc_id2index.update(dict(zip(doc_ids, list(range(doc_ids_len, doc_ids_len + len(doc_ids))))))
else:
raise ValueError("doc_ids cannot be used because they were not provided to model during training.")
if self.embedding_model == "doc2vec":
docs_processed = [tokenizer(doc) for doc in documents]
document_vectors = np.vstack([self.model.infer_vector(doc_words=doc,
alpha=0.025,
min_alpha=0.01,
epochs=100) for doc in docs_processed])
num_docs = len(documents)
self.model.dv.count += num_docs
self.model.dv.max_rawint += num_docs
self.model.dv.vectors_norm = None
self._set_document_vectors(np.vstack([self._get_document_vectors(norm=False), document_vectors]))
self.model.dv.init_sims()
document_vectors = self._l2_normalize(document_vectors)
else:
if use_embedding_model_tokenizer:
docs_training = documents
else:
docs_processed = [tokenizer(doc) for doc in documents]
docs_training = [' '.join(doc) for doc in docs_processed]
document_vectors = self._embed_documents(docs_training)
self._set_document_vectors(np.vstack([self._get_document_vectors(), document_vectors]))
# update index
if self.documents_indexed:
# update capacity of index
current_max = self.document_index.get_max_elements()
updated_max = current_max + len(documents)
self.document_index.resize_index(updated_max)
# update index_id and doc_ids
start_index_id = max(self.index_id2doc_id.keys()) + 1
new_index_ids = list(range(start_index_id, start_index_id + len(doc_ids)))
self.index_id2doc_id.update(dict(zip(new_index_ids, doc_ids)))
self.doc_id2index_id.update(dict(zip(doc_ids, new_index_ids)))
self.document_index.add_items(document_vectors, new_index_ids)
# update topics
self._assign_documents_to_topic(document_vectors, hierarchy=False)
if self.hierarchy is not None:
self._assign_documents_to_topic(document_vectors, hierarchy=True)
def delete_documents(self, doc_ids):
"""
Delete documents from current model.
Warning: If document ids were not used in original model, deleting
documents will change the indexes and therefore doc_ids.
The documents will be deleted from the current model without changing
existing document, word and topic vectors. Topic sizes will be updated.
If deleting a large quantity of documents relative to the current model
size a new model should be trained for best results.
Parameters
----------
doc_ids: List of str, int
A unique value per document that is used for referring to documents
in search results.
"""
# make sure documents exist
self._validate_doc_ids(doc_ids, doc_ids_neg=[])
# update index
if self.documents_indexed:
# delete doc_ids from index
index_ids = [self.doc_id2index_id[doc_id] for doc_id in doc_ids]
for index_id in index_ids:
self.document_index.mark_deleted(index_id)
# update index_id and doc_ids
for doc_id in doc_ids:
self.doc_id2index_id.pop(doc_id)
for index_id in index_ids:
self.index_id2doc_id.pop(index_id)
# get document indexes from ids
doc_indexes = self._get_document_indexes(doc_ids)
# delete documents
if self.documents is not None:
self.documents = np.delete(self.documents, doc_indexes, 0)
# delete document ids
if self.document_ids is not None:
for doc_id in doc_ids:
self.doc_id2index.pop(doc_id)
keys = list(self.doc_id2index.keys())
self.document_ids = np.array(keys)
values = list(range(0, len(self.doc_id2index.values())))
self.doc_id2index = dict(zip(keys, values))
# delete document vectors
self._set_document_vectors(np.delete(self._get_document_vectors(norm=False), doc_indexes, 0))
if self.embedding_model == 'doc2vec':
num_docs = len(doc_indexes)
self.model.dv.count -= num_docs
self.model.dv.max_rawint -= num_docs
self.model.dv.vectors_norm = None
self.model.dv.init_sims()
# update topics
self._unassign_documents_from_topic(doc_indexes, hierarchy=False)
if self.hierarchy is not None:
self._unassign_documents_from_topic(doc_indexes, hierarchy=True)
def get_num_topics(self, reduced=False):
"""
Get number of topics.
This is the number of topics Top2Vec has found in the data by default.
If reduced is True, the number of reduced topics is returned.
Parameters
----------
reduced: bool (Optional, default False)
The number of original topics will be returned by default. If True
will return the number of reduced topics, if hierarchical topic
reduction has been performed.
Returns
-------
num_topics: int
"""
if reduced:
self._validate_hierarchical_reduction()
return len(self.topic_vectors_reduced)
else:
return len(self.topic_vectors)
def get_topic_sizes(self, reduced=False):
"""
Get topic sizes.
The number of documents most similar to each topic. Topics are
in decreasing order of size.
The sizes of the original topics are returned unless reduced=True,
in which case the sizes of the reduced topics will be returned.
Parameters
----------
reduced: bool (Optional, default False)
Original topic sizes are returned by default. If True the
reduced topic sizes will be returned.
Returns
-------
topic_sizes: array of int, shape(num_topics)
The number of documents most similar to the topic.
topic_nums: array of int, shape(num_topics)
The unique number of every topic will be returned.
"""
if reduced:
self._validate_hierarchical_reduction()
return np.array(self.topic_sizes_reduced.values), np.array(self.topic_sizes_reduced.index)
else:
return np.array(self.topic_sizes.values), np.array(self.topic_sizes.index)
def get_topics(self, num_topics=None, reduced=False):
"""
Get topics, ordered by decreasing size. All topics are returned
if num_topics is not specified.
The original topics found are returned unless reduced=True,
in which case reduced topics will be returned.
Each topic will consist of the top 50 semantically similar words
to the topic. These are the 50 words closest to the topic vector,
along with the cosine similarity of each word to that vector. The
higher the score, the more relevant the word is to the topic.
Parameters
----------
num_topics: int, (Optional)
Number of topics to return.
reduced: bool (Optional, default False)
Original topics are returned by default. If True the
reduced topics will be returned.
Returns
-------
topics_words: array of shape(num_topics, 50)
For each topic the top 50 words are returned, in order
of semantic similarity to topic.
Example:
[['data', 'deep', 'learning' ... 'artificial'], <Topic 0>
['environment', 'warming', 'climate' ... 'temperature'] <Topic 1>
...]
word_scores: array of shape(num_topics, 50)
For each topic the cosine similarity scores of the
top 50 words to the topic are returned.
Example:
[[0.7132, 0.6473, 0.5700 ... 0.3455], <Topic 0>
[0.7818, 0.7671, 0.7603 ... 0.6769] <Topic 1>
...]
topic_nums: array of int, shape(num_topics)
The unique number of every topic will be returned.
"""
if reduced:
self._validate_hierarchical_reduction()
if num_topics is None:
num_topics = len(self.topic_vectors_reduced)
else:
self._validate_num_topics(num_topics, reduced)
return self.topic_words_reduced[0:num_topics], self.topic_word_scores_reduced[0:num_topics], np.array(
range(0, num_topics))
else:
if num_topics is None:
num_topics = len(self.topic_vectors)
else:
self._validate_num_topics(num_topics, reduced)
return self.topic_words[0:num_topics], self.topic_word_scores[0:num_topics], np.array(range(0, num_topics))
def get_topic_hierarchy(self):
"""
Get the hierarchy of reduced topics. The mapping of each original topic
to the reduced topics is returned.
Hierarchical topic reduction must be performed before calling this
method.
Returns
-------
hierarchy: list of ints
Each index of the hierarchy corresponds to the topic number of a
reduced topic. For each reduced topic the topic numbers of the
original topics that were merged to create it are listed.
Example:
[[3] <Reduced Topic 0> contains original Topic 3
[2,4] <Reduced Topic 1> contains original Topics 2 and 4
[0,1] <Reduced Topic 3> contains original Topics 0 and 1
...]
"""
self._validate_hierarchical_reduction()
return self.hierarchy
def hierarchical_topic_reduction(self, num_topics):
"""
Reduce the number of topics discovered by Top2Vec.
The most representative topics of the corpus will be found, by
iteratively merging each smallest topic to the most similar topic until
num_topics is reached.
Parameters
----------
num_topics: int
The number of topics to reduce to.
Returns
-------
hierarchy: list of ints
Each index of hierarchy corresponds to the reduced topics, for each
reduced topic the indexes of the original topics that were merged
to create it are listed.
Example:
[[3] <Reduced Topic 0> contains original Topic 3
[2,4] <Reduced Topic 1> contains original Topics 2 and 4
[0,1] <Reduced Topic 3> contains original Topics 0 and 1
...]
"""
self._validate_hierarchical_reduction_num_topics(num_topics)
num_topics_current = self.topic_vectors.shape[0]
top_vecs = self.topic_vectors
top_sizes = [self.topic_sizes[i] for i in range(0, len(self.topic_sizes))]
hierarchy = [[i] for i in range(self.topic_vectors.shape[0])]
count = 0
interval = max(int(self._get_document_vectors().shape[0] / 50000), 1)
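# Recomputing exact topic sizes after every merge is expensive for large corpora, so exact
# counts are refreshed only every `interval` merges; in between, the merged topic's size is
# approximated as the sum of the two merged sizes.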
while num_topics_current > num_topics:
# find smallest and most similar topics
smallest = np.argmin(top_sizes)
res = np.inner(top_vecs[smallest], top_vecs)
sims = np.flip(np.argsort(res))
most_sim = sims[1]
if most_sim == smallest:
most_sim = sims[0]
# calculate combined topic vector
top_vec_smallest = top_vecs[smallest]
smallest_size = top_sizes[smallest]
top_vec_most_sim = top_vecs[most_sim]
most_sim_size = top_sizes[most_sim]
combined_vec = self._l2_normalize(((top_vec_smallest * smallest_size) +
(top_vec_most_sim * most_sim_size)) / (smallest_size + most_sim_size))
# update topic vectors
ix_keep = list(range(len(top_vecs)))
ix_keep.remove(smallest)
ix_keep.remove(most_sim)
top_vecs = top_vecs[ix_keep]
top_vecs = np.vstack([top_vecs, combined_vec])
num_topics_current = top_vecs.shape[0]
# update topics sizes
if count % interval == 0:
doc_top = self._calculate_documents_topic(topic_vectors=top_vecs,
document_vectors=self._get_document_vectors(),
dist=False)
topic_sizes = pd.Series(doc_top).value_counts()
top_sizes = [topic_sizes[i] for i in range(0, len(topic_sizes))]
else:
smallest_size = top_sizes.pop(smallest)
if most_sim < smallest:
most_sim_size = top_sizes.pop(most_sim)
else:
most_sim_size = top_sizes.pop(most_sim - 1)
combined_size = smallest_size + most_sim_size
top_sizes.append(combined_size)
count += 1
# update topic hierarchy
smallest_inds = hierarchy.pop(smallest)
if most_sim < smallest:
most_sim_inds = hierarchy.pop(most_sim)
else:
most_sim_inds = hierarchy.pop(most_sim - 1)
combined_inds = smallest_inds + most_sim_inds
hierarchy.append(combined_inds)
# re-calculate topic vectors from clusters
doc_top = self._calculate_documents_topic(topic_vectors=top_vecs,
document_vectors=self._get_document_vectors(),
dist=False)
self.topic_vectors_reduced = self._l2_normalize(np.vstack([self._get_document_vectors()
[np.where(doc_top == label)[0]]
.mean(axis=0) for label in set(doc_top)]))
self.hierarchy = hierarchy
# assign documents to topic
self.doc_top_reduced, self.doc_dist_reduced = self._calculate_documents_topic(self.topic_vectors_reduced,
self._get_document_vectors())
# find topic words and scores
self.topic_words_reduced, self.topic_word_scores_reduced = self._find_topic_words_and_scores(
topic_vectors=self.topic_vectors_reduced)
# calculate topic sizes
self.topic_sizes_reduced = self._calculate_topic_sizes(hierarchy=True)
# re-order topics
self._reorder_topics(hierarchy=True)
return self.hierarchy
def query_documents(self, query, num_docs, return_documents=True, use_index=False, ef=None, tokenizer=None):
"""
Semantic search of documents using a text query.
The most semantically similar documents to the query will be returned.
Parameters
----------
query: string
Any sequence of text. This could be an actual question, a sentence,
a paragraph or a document.
num_docs: int
Number of documents to return.
return_documents: bool (Optional default True)
Determines if the documents will be returned. If they were not
saved in the model they will not be returned.
use_index: bool (Optional default False)
If index_documents method has been called, setting this to True
will speed up search for models with large number of documents.
ef: int (Optional default None)
Higher ef leads to more accurate but slower search. This value
must be higher than num_docs.
For more information see:
https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
tokenizer: callable (Optional, default None)
** For doc2vec embedding model only **
Override the default tokenization method. If None then
gensim.utils.simple_preprocess will be used.
Returns
-------
documents: (Optional) array of str, shape(num_docs)
The documents in a list, the most similar are first.
Will only be returned if the documents were saved and if
return_documents is set to True.
doc_scores: array of float, shape(num_docs)
Semantic similarity of document to vector. The cosine similarity of
the document and vector.
doc_ids: array of int, shape(num_docs)
Unique ids of documents. If ids were not given to the model, the
index of the document in the model will be returned.
"""
self._validate_query(query)
self._validate_num_docs(num_docs)
if self.embedding_model != "doc2vec":
query_vec = self._embed_query(query)
else:
# if tokenizer is not passed use default
if tokenizer is None:
tokenizer = default_tokenizer
tokenized_query = tokenizer(query)
query_vec = self.model.infer_vector(doc_words=tokenized_query,
alpha=0.025,
min_alpha=0.01,
epochs=100)
return self.search_documents_by_vector(query_vec, num_docs, return_documents=return_documents,
use_index=use_index, ef=ef)
def query_topics(self, query, num_topics, reduced=False, tokenizer=None):
"""
Semantic search of topics using a text query.
These are the topics closest to the embedded query. Topics are ordered by
proximity to the query vector. Successive topics in the list are less
semantically similar to the query.
Parameters
----------
query: string
Any sequence of text. This could be an actual question, a sentence,
a paragraph or a document.
num_topics: int
Number of topics to return.
reduced: bool (Optional, default False)
Original topics are searched by default. If True the
reduced topics will be searched.
tokenizer: callable (Optional, default None)
** For doc2vec embedding model only **
Override the default tokenization method. If None then
gensim.utils.simple_preprocess will be used.
Returns
-------
topics_words: array of shape (num_topics, 50)
For each topic the top 50 words are returned, in order of semantic
similarity to topic.
Example:
[['data', 'deep', 'learning' ... 'artificial'], <Topic 0>
['environment', 'warming', 'climate' ... 'temperature'] <Topic 1>
...]
word_scores: array of shape (num_topics, 50)
For each topic the cosine similarity scores of the top 50 words
to the topic are returned.
Example:
[[0.7132, 0.6473, 0.5700 ... 0.3455], <Topic 0>
[0.7818, 0.7671, 0.7603 ... 0.6769] <Topic 1>
...]
topic_scores: array of float, shape(num_topics)
For each topic the cosine similarity to the search keywords will be
returned.
topic_nums: array of int, shape(num_topics)
The unique number of every topic will be returned.
"""
self._validate_query(query)
if self.embedding_model != "doc2vec":
query_vec = self._embed_query(query)
else:
# if tokenizer is not passed use default
if tokenizer is None:
tokenizer = default_tokenizer
tokenized_query = tokenizer(query)
query_vec = self.model.infer_vector(doc_words=tokenized_query,
alpha=0.025,
min_alpha=0.01,
epochs=100)
return self.search_topics_by_vector(query_vec, num_topics=num_topics, reduced=reduced)
def search_documents_by_vector(self, vector, num_docs, return_documents=True, use_index=False, ef=None):
"""
Semantic search of documents using a vector.
These are the documents closest to the vector. Documents are
ordered by proximity to the vector. Successive documents in the
list are less semantically similar to the vector.
Parameters
----------
vector: array of shape(vector dimension, 1)
The vector dimension should be the same as the vectors in
the topic_vectors variable. (i.e. model.topic_vectors.shape[1])
num_docs: int
Number of documents to return.
return_documents: bool (Optional default True)
Determines if the documents will be returned. If they were not
saved in the model they will not be returned.
use_index: bool (Optional default False)
If index_documents method has been called, setting this to True
will speed up search for models with large number of documents.
ef: int (Optional default None)
Higher ef leads to more accurate but slower search. This value
must be higher than num_docs.
For more information see:
https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
Returns
-------
documents: (Optional) array of str, shape(num_docs)
The documents in a list, the most similar are first.
Will only be returned if the documents were saved and if
return_documents is set to True.
doc_scores: array of float, shape(num_docs)
Semantic similarity of document to vector. The cosine similarity of
the document and vector.
doc_ids: array of int, shape(num_docs)
Unique ids of documents. If ids were not given to the model, the
index of the document in the model will be returned.
"""
self._validate_vector(vector)
self._validate_num_docs(num_docs)
vector = self._l2_normalize(vector)
if use_index:
self._check_document_index_status()
if ef is not None:
self.document_index.set_ef(ef)
else:
self.document_index.set_ef(num_docs)
index_ids, doc_scores = self.document_index.knn_query(vector, k=num_docs)
index_ids = index_ids[0]
doc_ids = np.array([self.index_id2doc_id[index_id] for index_id in index_ids])
doc_scores = doc_scores[0]
doc_scores = np.array([1 - score for score in doc_scores])
doc_indexes = self._get_document_indexes(doc_ids)
else:
doc_indexes, doc_scores = self._search_vectors_by_vector(self._get_document_vectors(),
vector, num_docs)
doc_ids = self._get_document_ids(doc_indexes)
if self.documents is not None and return_documents:
documents = self.documents[doc_indexes]
return documents, doc_scores, doc_ids
else:
return doc_scores, doc_ids
def search_words_by_vector(self, vector, num_words, use_index=False, ef=None):
"""
Semantic search of words using a vector.
These are the words closest to the vector. Words are ordered by
proximity to the vector. Successive words in the list are less
semantically similar to the vector.
Parameters
----------
vector: array of shape(vector dimension, 1)
The vector dimension should be the same as the vectors in
the topic_vectors variable. (i.e. model.topic_vectors.shape[1])
num_words: int
Number of words to return.
use_index: bool (Optional default False)
If index_words method has been called, setting this to True will
speed up search for models with large number of words.
ef: int (Optional default None)
Higher ef leads to more accurate but slower search. This value
must be higher than num_words.
For more information see:
https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
Returns
-------
words: array of str, shape(num_words)
The words in a list, the most similar are first.
word_scores: array of float, shape(num_words)
Semantic similarity of word to vector. The cosine similarity of
the word and vector.
"""
self._validate_vector(vector)
vector = self._l2_normalize(vector)
if use_index:
self._check_word_index_status()
if ef is not None:
self.word_index.set_ef(ef)
else:
self.word_index.set_ef(num_words)
word_indexes, word_scores = self.word_index.knn_query(vector, k=num_words)
word_indexes = word_indexes[0]
word_scores = word_scores[0]
word_scores = np.array([1 - score for score in word_scores])
else:
word_indexes, word_scores = self._search_vectors_by_vector(self._get_word_vectors(),
vector, num_words)
words = np.array([self._index2word(index) for index in word_indexes])
return words, word_scores
def search_topics_by_vector(self, vector, num_topics, reduced=False):
"""
Semantic search of topics using a vector.
These are the topics closest to the vector. Topics are ordered by
proximity to the vector. Successive topics in the list are less
semantically similar to the vector.
Parameters
----------
vector: array of shape(vector dimension, 1)
The vector dimension should be the same as the vectors in
the topic_vectors variable. (i.e. model.topic_vectors.shape[1])
num_topics: int
Number of topics to return.
reduced: bool (Optional, default False)
Original topics are searched by default. If True the
reduced topics will be searched.
Returns
-------
topics_words: array of shape (num_topics, 50)
For each topic the top 50 words are returned, in order of semantic
similarity to topic.
Example:
[['data', 'deep', 'learning' ... 'artificial'], <Topic 0>
['environment', 'warming', 'climate' ... 'temperature'] <Topic 1>
...]
word_scores: array of shape (num_topics, 50)
For each topic the cosine similarity scores of the top 50 words
to the topic are returned.
Example:
[[0.7132, 0.6473, 0.5700 ... 0.3455], <Topic 0>
[0.7818, 0.7671, 0.7603 ... 0.6769] <Topic 1>
...]
topic_scores: array of float, shape(num_topics)
For each topic the cosine similarity to the search keywords will be
returned.
topic_nums: array of int, shape(num_topics)
The unique number of every topic will be returned.
"""
self._validate_vector(vector)
self._validate_num_topics(num_topics, reduced)
vector = self._l2_normalize(vector)
if reduced:
self._validate_hierarchical_reduction()
topic_nums, topic_scores = self._search_vectors_by_vector(self.topic_vectors_reduced,
vector, num_topics)
topic_words = [self.topic_words_reduced[topic] for topic in topic_nums]
word_scores = [self.topic_word_scores_reduced[topic] for topic in topic_nums]
else:
topic_nums, topic_scores = self._search_vectors_by_vector(self.topic_vectors,
vector, num_topics)
topic_words = [self.topic_words[topic] for topic in topic_nums]
word_scores = [self.topic_word_scores[topic] for topic in topic_nums]
return topic_words, word_scores, topic_scores, topic_nums
def search_documents_by_topic(self, topic_num, num_docs, return_documents=True, reduced=False):
"""
Get the most semantically similar documents to the topic.
These are the documents closest to the topic vector. Documents are
ordered by proximity to the topic vector. Successive documents in the
list are less semantically similar to the topic.
Parameters
----------
topic_num: int
The topic number to search.
num_docs: int
Number of documents to return.
return_documents: bool (Optional default True)
Determines if the documents will be returned. If they were not
saved in the model they will not be returned.
reduced: bool (Optional, default False)
Original topics are used to search by default. If True the
reduced topics will be used.
Returns
-------
documents: (Optional) array of str, shape(num_docs)
The documents in a list, the most similar are first.
Will only be returned if the documents were saved and if
return_documents is set to True.
doc_scores: array of float, shape(num_docs)
Semantic similarity of document to topic. The cosine similarity of
the document and topic vector.
doc_ids: array of int, shape(num_docs)
Unique ids of documents. If ids were not given to the model, the
index of the document in the model will be returned.
"""
if reduced:
self._validate_hierarchical_reduction()
self._validate_topic_num(topic_num, reduced)
self._validate_topic_search(topic_num, num_docs, reduced)
topic_document_indexes = np.where(self.doc_top_reduced == topic_num)[0]
topic_document_indexes_ordered = np.flip(np.argsort(self.doc_dist_reduced[topic_document_indexes]))
doc_indexes = topic_document_indexes[topic_document_indexes_ordered][0:num_docs]
doc_scores = self.doc_dist_reduced[doc_indexes]
doc_ids = self._get_document_ids(doc_indexes)
else:
self._validate_topic_num(topic_num, reduced)
self._validate_topic_search(topic_num, num_docs, reduced)
topic_document_indexes = np.where(self.doc_top == topic_num)[0]
topic_document_indexes_ordered = np.flip(np.argsort(self.doc_dist[topic_document_indexes]))
doc_indexes = topic_document_indexes[topic_document_indexes_ordered][0:num_docs]
doc_scores = self.doc_dist[doc_indexes]
doc_ids = self._get_document_ids(doc_indexes)
if self.documents is not None and return_documents:
documents = self.documents[doc_indexes]
return documents, doc_scores, doc_ids
else:
return doc_scores, doc_ids
def search_documents_by_keywords(self, keywords, num_docs, keywords_neg=None, return_documents=True,
use_index=False, ef=None):
"""
Semantic search of documents using keywords.
The most semantically similar documents to the combination of the
keywords will be returned. If negative keywords are provided, the
documents will be semantically dissimilar to those words. Too many
keywords or certain combinations of words may give strange results.
This method finds an average vector (negative keywords are subtracted)
of all the keyword vectors and returns the documents closest to the
resulting vector.
Parameters
----------
keywords: List of str
List of positive keywords being used for search of semantically
similar documents.
keywords_neg: List of str (Optional)
List of negative keywords being used for search of semantically
dissimilar documents.
num_docs: int
Number of documents to return.
return_documents: bool (Optional default True)
Determines if the documents will be returned. If they were not
saved in the model they will also not be returned.
use_index: bool (Optional default False)
If index_documents method has been called, setting this to True
will speed up search for models with large number of documents.
ef: int (Optional default None)
Higher ef leads to more accurate but slower search. This value
must be higher than num_docs.
For more information see:
https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
Returns
-------
documents: (Optional) array of str, shape(num_docs)
The documents in a list, the most similar are first.
Will only be returned if the documents were saved and if
return_documents is set to True.
doc_scores: array of float, shape(num_docs)
Semantic similarity of document to keywords. The cosine similarity
of the document and average of keyword vectors.
doc_ids: array of int, shape(num_docs)
Unique ids of documents. If ids were not given to the model, the
index of the document in the model will be returned.
"""
if keywords_neg is None:
keywords_neg = []
self._validate_num_docs(num_docs)
keywords, keywords_neg = self._validate_keywords(keywords, keywords_neg)
word_vecs = self._words2word_vectors(keywords)
neg_word_vecs = self._words2word_vectors(keywords_neg)
if use_index:
self._check_document_index_status()
combined_vector = self._get_combined_vec(word_vecs, neg_word_vecs)
return self.search_documents_by_vector(combined_vector, num_docs, return_documents=return_documents,
use_index=True, ef=ef)
if self.embedding_model == 'doc2vec':
sim_docs = self.model.dv.most_similar(positive=word_vecs,
negative=neg_word_vecs,
topn=num_docs)
doc_indexes = [doc[0] for doc in sim_docs]
doc_scores = np.array([doc[1] for doc in sim_docs])
else:
combined_vector = self._get_combined_vec(word_vecs, neg_word_vecs)
doc_indexes, doc_scores = self._search_vectors_by_vector(self._get_document_vectors(),
combined_vector, num_docs)
doc_ids = self._get_document_ids(doc_indexes)
if self.documents is not None and return_documents:
documents = self.documents[doc_indexes]
return documents, doc_scores, doc_ids
else:
return doc_scores, doc_ids
def similar_words(self, keywords, num_words, keywords_neg=None, use_index=False, ef=None):
"""
Semantic similarity search of words.
The most semantically similar word to the combination of the keywords
will be returned. If negative keywords are provided, the words will be
semantically dissimilar to those words. Too many keywords or certain
combinations of words may give strange results. This method finds an
average vector (negative keywords are subtracted) of all the keyword
vectors and returns the words closest to the resulting vector.
Parameters
----------
keywords: List of str
List of positive keywords being used for search of semantically
similar words.
keywords_neg: List of str
List of negative keywords being used for search of semantically
dissimilar words.
num_words: int
Number of words to return.
use_index: bool (Optional default False)
If index_words method has been called, setting this to True will
speed up search for models with large number of words.
ef: int (Optional default None)
Higher ef leads to more accurate but slower search. This value
must be higher than num_words.
For more information see:
https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
Returns
-------
words: array of str, shape(num_words)
The words in a list, the most similar are first.
word_scores: array of float, shape(num_words)
Semantic similarity of word to keywords. The cosine similarity of
the word and average of keyword vectors.
"""
if keywords_neg is None:
keywords_neg = []
keywords, keywords_neg = self._validate_keywords(keywords, keywords_neg)
word_vecs = self._words2word_vectors(keywords)
neg_word_vecs = self._words2word_vectors(keywords_neg)
combined_vector = self._get_combined_vec(word_vecs, neg_word_vecs)
num_res = min(num_words + len(keywords) + len(keywords_neg), self._get_word_vectors().shape[0])
words, word_scores = self.search_words_by_vector(vector=combined_vector,
num_words=num_res,
use_index=use_index,
ef=ef)
res_indexes = [index for index, word in enumerate(words)
if word not in list(keywords) + list(keywords_neg)][:num_words]
words = words[res_indexes]
word_scores = word_scores[res_indexes]
return words, word_scores
def search_topics(self, keywords, num_topics, keywords_neg=None, reduced=False):
"""
Semantic search of topics using keywords.
The most semantically similar topics to the combination of the keywords
will be returned. If negative keywords are provided, the topics will be
semantically dissimilar to those words. Topics will be ordered by
decreasing similarity to the keywords. Too many keywords or certain
combinations of words may give strange results. This method finds an
average vector (negative keywords are subtracted) of all the keyword
vectors and returns the topics closest to the resulting vector.
Parameters
----------
keywords: List of str
List of positive keywords being used for search of semantically
similar documents.
keywords_neg: (Optional) List of str
List of negative keywords being used for search of semantically
dissimilar documents.
num_topics: int
Number of topics to return.
reduced: bool (Optional, default False)
Original topics are searched by default. If True the
reduced topics will be searched.
Returns
-------
topics_words: array of shape (num_topics, 50)
For each topic the top 50 words are returned, in order of semantic
similarity to topic.
Example:
[['data', 'deep', 'learning' ... 'artificial'], <Topic 0>
['environment', 'warming', 'climate' ... 'temperature'] <Topic 1>
...]
word_scores: array of shape (num_topics, 50)
For each topic the cosine similarity scores of the top 50 words
to the topic are returned.
Example:
[[0.7132, 0.6473, 0.5700 ... 0.3455], <Topic 0>
[0.7818, 0.7671, 0.7603 ... 0.6769] <Topic 1>
...]
topic_scores: array of float, shape(num_topics)
For each topic the cosine similarity to the search keywords will be
returned.
topic_nums: array of int, shape(num_topics)
The unique number of every topic will be returned.
"""
if keywords_neg is None:
keywords_neg = []
keywords, keywords_neg = self._validate_keywords(keywords, keywords_neg)
word_vecs = self._words2word_vectors(keywords)
neg_word_vecs = self._words2word_vectors(keywords_neg)
combined_vector = self._get_combined_vec(word_vecs, neg_word_vecs)
return self.search_topics_by_vector(combined_vector, num_topics=num_topics, reduced=reduced)
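# Illustrative (hypothetical) call of search_topics; the German keywords below are
# placeholders, not taken from the actual corpus:
# topics_words, word_scores, topic_scores, topic_nums = model.search_topics(
#     keywords=["niere"], keywords_neg=["tumor"], num_topics=5)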
def search_documents_by_documents(self, doc_ids, num_docs, doc_ids_neg=None, return_documents=True,
use_index=False, ef=None):
"""
Semantic similarity search of documents.
The most semantically similar documents to the semantic combination of
document ids provided will be returned. If negative document ids are
provided, the documents will be semantically dissimilar to those
document ids. Documents will be ordered by decreasing similarity. This
method finds the closest document vectors to the provided documents
averaged.
Parameters
----------
doc_ids: List of int, str
Unique ids of document. If ids were not given, the index of
document in the original corpus.
doc_ids_neg: (Optional) List of int, str
Unique ids of document. If ids were not given, the index of
document in the original corpus.
num_docs: int
Number of documents to return.
return_documents: bool (Optional default True)
Determines if the documents will be returned. If they were not
saved in the model they will also not be returned.
use_index: bool (Optional default False)
If index_documents method has been called, setting this to True
will speed up search for models with large number of documents.
ef: int (Optional default None)
Higher ef leads to more accurate but slower search. This value
must be higher than num_docs.
For more information see:
https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
Returns
-------
documents: (Optional) array of str, shape(num_docs)
The documents in a list, the most similar are first.
Will only be returned if the documents were saved and if
return_documents is set to True.
doc_scores: array of float, shape(num_docs)
Semantic similarity of each document to the provided documents. The cosine
similarity of the document vector and the average of the provided document vectors.
doc_ids: array of int, shape(num_docs)
Unique ids of documents. If ids were not given to the model, the
index of the document in the model will be returned.
"""
if doc_ids_neg is None:
doc_ids_neg = []
self._validate_num_docs(num_docs)
self._validate_doc_ids(doc_ids, doc_ids_neg)
doc_indexes = self._get_document_indexes(doc_ids)
doc_indexes_neg = self._get_document_indexes(doc_ids_neg)
if use_index:
self._check_document_index_status()
document_vectors = self._get_document_vectors()
doc_vecs = [document_vectors[ind] for ind in doc_indexes]
doc_vecs_neg = [document_vectors[ind] for ind in doc_indexes_neg]
combined_vector = self._get_combined_vec(doc_vecs, doc_vecs_neg)
return self.search_documents_by_vector(combined_vector, num_docs, return_documents=return_documents,
use_index=True, ef=ef)
if self.embedding_model == 'doc2vec':
sim_docs = self.model.dv.most_similar(positive=doc_indexes,
negative=doc_indexes_neg,
topn=num_docs)
doc_indexes = [doc[0] for doc in sim_docs]
doc_scores = np.array([doc[1] for doc in sim_docs])
else:
doc_vecs = [self.document_vectors[ind] for ind in doc_indexes]
doc_vecs_neg = [self.document_vectors[ind] for ind in doc_indexes_neg]
combined_vector = self._get_combined_vec(doc_vecs, doc_vecs_neg)
num_res = min(num_docs + len(doc_indexes) + len(doc_indexes_neg),
self._get_document_vectors().shape[0])
# don't return documents that were searched
search_doc_indexes = list(doc_indexes) + list(doc_indexes_neg)
doc_indexes, doc_scores = self._search_vectors_by_vector(self._get_document_vectors(),
combined_vector, num_res)
res_indexes = [index for index, doc_ind in enumerate(doc_indexes)
if doc_ind not in search_doc_indexes][:num_docs]
doc_indexes = doc_indexes[res_indexes]
doc_scores = doc_scores[res_indexes]
doc_ids = self._get_document_ids(doc_indexes)
if self.documents is not None and return_documents:
documents = self.documents[doc_indexes]
return documents, doc_scores, doc_ids
else:
return doc_scores, doc_ids
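# Illustrative (hypothetical) call: find the 10 reports most similar to documents
# 0 and 5 while steering away from document 3 (the ids are made-up placeholders):
# documents, doc_scores, doc_ids = model.search_documents_by_documents(
#     doc_ids=[0, 5], doc_ids_neg=[3], num_docs=10)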
def generate_topic_wordcloud(self, topic_num, background_color="black", reduced=False):
"""
Create a word cloud for a topic.
A word cloud will be generated and displayed. The most semantically
similar words to the topic will have the largest size, less similar
words will be smaller. The size is determined using the cosine distance
of the word vectors from the topic vector.
Parameters
----------
topic_num: int
The topic number to search.
background_color : str (Optional, default='black')
Background color for the word cloud image. Suggested options are:
* white
* black
reduced: bool (Optional, default False)
Original topics are used by default. If True the
reduced topics will be used.
Returns
-------
A matplotlib plot of the word cloud with the topic number will be
displayed.
"""
if reduced:
self._validate_hierarchical_reduction()
self._validate_topic_num(topic_num, reduced)
word_score_dict = dict(zip(self.topic_words_reduced[topic_num],
softmax(self.topic_word_scores_reduced[topic_num])))
else:
self._validate_topic_num(topic_num, reduced)
word_score_dict = dict(zip(self.topic_words[topic_num],
softmax(self.topic_word_scores[topic_num])))
plt.figure(figsize=(16, 4),
dpi=200)
plt.axis("off")
plt.imshow(
WordCloud(width=1600,
height=400,
background_color=background_color).generate_from_frequencies(word_score_dict))
plt.title("Topic " + str(topic_num), loc='left', fontsize=25, pad=20)
\ No newline at end of file
import sys, os
from tqdm import tqdm
import pandas as pd
import pickle
from database_preparation.preprocess import print_meta_data
from database_preparation.utils_labeled_datasets import is_text_lst_tokenized
sys.path.append(os.getcwd())
# parse arguments:
from TextClustering.argsparse_clustering_preamble import argsparse_preamble
args = argsparse_preamble()
print("arguments:")
print(args)
if is_text_lst_tokenized(args.path2corpus):
print("Error: "+args.path2corpus + '.pkl is tokenized! '
'Please pass texts list where each text is a single string!')
exit(1)
#%% load the data
with open(args.path2corpus, 'rb') as f:
diag_lst = pickle.load(f)
print_meta_data(args.path2corpus)
#%% perform clustering repetitive
if args.find_k_value:
from TextClustering.utils_metrics import ClusterMetrics
from TextClustering.basedOn_Embedding.top2vec import Top2Vec
import matplotlib.pyplot as plt # load our modified version (for visualization)
s_score, n_cluster = [], []
cluster_size = range(3, 25, 2)
for i_cluster_size in tqdm(cluster_size):
#%% perform text-clustering (like in the paper)
hdbscan_args = {'min_cluster_size': i_cluster_size,
'metric': 'euclidean',
'cluster_selection_method': 'eom'}
model = Top2Vec(diag_lst,
embedding_model=args.model2use,
min_count=0,
hdbscan_args=hdbscan_args)
#%% get the clusters
n_cluster.append(model.get_num_topics())
evaluation = ClusterMetrics(model.umap_model.embedding_[model.result.labels >= 0,],
model.clustered.labels.tolist())
s_score.append(evaluation.s_score)
#%% plot the results
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ax1.plot(cluster_size, s_score, 'bx-')
ax2.plot(cluster_size, n_cluster, 'rx-')
ax1.set_xlabel('Minimal cluster size')
ax1.yaxis.label.set_color('blue')
ax1.set_ylabel('Silhouette Coefficient')
ax2.yaxis.label.set_color('red')
ax2.set_ylabel('Number of clusters')
plt.title('Elbow-method-like plot')
plt.show()
exit()
#%% perform text-clustering (like in the paper)
from TextClustering.basedOn_Embedding.top2vec import Top2Vec # load our modified version (for visualization)
hdbscan_args = {'min_cluster_size': args.k_value,
'metric': 'euclidean',
'cluster_selection_method': 'eom'}
model = Top2Vec(diag_lst,
embedding_model=args.model2use,
min_count=0,
hdbscan_args=hdbscan_args)
#%% get the words and topics
# using top2vec's built-in functions
model.get_num_topics()
topic_sizes, topic_nums = model.get_topic_sizes()
# print some infos:
outliers = model.umap_model.embedding_[model.result.labels == -1,]
print(f"found {len(topic_nums)} topics")
print(f"found {len(outliers)} outilers.")
topic_words, word_scores, topic_nums = model.get_topics(len(topic_nums))
pd.DataFrame(topic_words).to_excel(
'TextClustering/tables/WordsPerCluster_Top2Vec_' + args.model2use + '.xlsx',
sheet_name= "in-function")
# save umaped vectors and labels:
df = pd.read_pickle(args.df_cases_file)
df['umapX_top2vec'] = model.result.x
df['umapY_top2vec'] = model.result.y
df['label_top2vec'] = model.result.labels
df.to_pickle(args.df_cases_file)
#%% calculate clustering-metrics
from TextClustering.utils_metrics import ClusterMetrics
evaluation = ClusterMetrics(model.umap_model.embedding_[model.result.labels >= 0,], model.clustered.labels.tolist(),
file_name= "TextClustering/cluster_metrics/top2vec_metrics.pkl")
evaluation.write_to_file()
\ No newline at end of file
# -*- coding: iso-8859-1 -*-
import pickle
from os import listdir
from os.path import isfile, join
import numpy as np
import pandas as pd
import database_preparation.utils_labeled_datasets as dt
from TextClassification.classification_for_cluster_evaluation import cross_validate_with_simple_SVM
from CorpusHomogeneity.cluster_entropy import cluster_entropy
from CorpusHomogeneity.text_entropy import corpus_entropy
recalc_cls_accuracy = True
use_always_bow_data_for_svm_accuracy = True
sort_table_by = ['s-score'] # s-score or cls accuracy
table_save_path = 'TextClustering/tables/cluster_metrics_overview'
path2corpus_bow_preprocessed = 'database/bow_prepro_diag.pkl'
path2corpus_embedding_preprocessed = 'database/embedding_prepro_diag.pkl'
scorepath = "TextClustering/cluster_metrics/"
df_cases_file = './database/df_cases.pkl'
def main():
# ########## print cluster scores as latex table: ##################
methodnames = ['KMeans', 'LDA', 'HDBSCAN', 'German_BERT', 'Patho_BERT', 'top2vec', 'GSDPMM']
skipped_methods = []
print(dt.get_all_label_set_ids())
s_scores = []
entropy_scores = []
cls_ac_scores = []
cluster_nums = []
report_nums = []
round_to = 3
for label_set in methodnames:
try:
scores = pd.read_pickle(scorepath + label_set + "_metrics.pkl")[label_set+'_metrics']
except:
print(f"skipping {label_set}.")
skipped_methods.append(label_set)
continue
if label_set in ['German_BERT', 'Patho_BERT', 'top2vec']:
text_corpus_path = path2corpus_embedding_preprocessed
else:
text_corpus_path = path2corpus_bow_preprocessed
try:
s_scores.append(round(scores['s-score'],3))
except:
s_scores.append(None)
try:
cluster_nums.append(str(dt.get_amount_unique_labels(label_set)))
except:
cluster_nums.append(None)
try:
report_nums.append(str(dt.get_amount_reports(label_set)))
except:
report_nums.append(None)
### cls accuracy with svm ###
if recalc_cls_accuracy:
if use_always_bow_data_for_svm_accuracy:
metrics = cross_validate_with_simple_SVM(label_set,
path2corpus_bow_preprocessed,
df_cases_file)
else:
metrics = cross_validate_with_simple_SVM(label_set,
text_corpus_path,
df_cases_file)
print("================ f1-per cluster for cluster-set: " + label_set + " ================")
df = metrics.classes_scores(-1)
print(df.to_latex().replace('{}', 'cluster'))
cls_ac_scores.append(round(np.mean(metrics.scores['accuracy']), round_to))
else:
try:
cls_ac_scores.append(round(scores['svm-accuracy'], round_to))
except:
cls_ac_scores.append(None)
### calculate entropy ###
with open(text_corpus_path, 'rb') as f:
text = pickle.load(f)
df = pd.read_pickle(df_cases_file)
clusters = df['label_'+label_set].tolist()
frame = pd.DataFrame({'text': text, 'cluster': clusters}, index=[clusters])
ent = cluster_entropy(frame)
ent_mean, ent_std = corpus_entropy(text)
entropy_scores.append(round(ent[0] / ent_mean, round_to))
for methodname in skipped_methods:
methodnames.remove(methodname)
methodnames = [n.replace("_metrics", "") for n in methodnames]
df = pd.DataFrame({'cluster method': methodnames, 's-score': s_scores,
'cls accuracy': cls_ac_scores, 'rel entropy': entropy_scores,
'clusters': cluster_nums, 'corpus size': report_nums})
df.sort_values(by=sort_table_by, inplace=True, ascending=False)
latex_table = df.to_latex(index=False)
print("%================== clustering metric scores =================")
print(latex_table)
print("%===================================\n\n")
with open(table_save_path+'_latex.txt', 'w') as f:
f.write(latex_table)
df.to_excel(table_save_path+'.xlsx')
if __name__ == '__main__':
main()
\ No newline at end of file
import os, sys
# params:
path2corpus_bow_preprocessed_diagnosis = 'database/bow_prepro_diag.pkl'
path2corpus_embedding_preprocessed_diagnosis = 'database/embedding_prepro_diag.pkl'
# check if we are at correct working directory:
workdir = os.getcwd()
if not workdir[-len('nlp-in-diagnostic-texts-from-nephropathology'):] == 'nlp-in-diagnostic-texts-from-nephropathology':
print(workdir + " is the wrong working directory.")
print("please make shure to run this script with working directory '.../path/to/nlp-in-diagnostic-texts-from-nephropathology'.")
exit(1)
# add some folders for metrics, plot, tables etc:
if not os.path.isdir('TextClustering/cluster_metrics'):
os.makedirs('TextClustering/cluster_metrics')
if not os.path.isdir('TextClustering/tables'):
os.makedirs('TextClustering/tables')
if not os.path.isdir('TextClustering/plots'):
os.makedirs('TextClustering/plots')
if not os.path.isdir('TextClustering/plots/histograms'):
os.makedirs('TextClustering/plots/histograms')
if not os.path.isdir('TextClustering/plots/UMAP'):
os.makedirs('TextClustering/plots/UMAP')
if not os.path.isdir('TextClustering/plots/PCA'):
os.makedirs('TextClustering/plots/PCA')
# Construct the clustering pipeline. This is a suggestion for how to use all the scripts.
# I also recommend running each clustering script one by one to fine-tune the clusterings (with argument --find_k_value).
script_queue = [
f"python TextClustering/basedOn_BOW/kmeans_Diagnosis.py --path2corpus {path2corpus_bow_preprocessed_diagnosis} --k_value {10}",
f"python TextClustering/basedOn_BOW/LDA_Diagnosis.py --path2corpus {path2corpus_bow_preprocessed_diagnosis} --k_value {12}",
f"python TextClustering/basedOn_BOW/HDBSCAN_Diagnosis.py --path2corpus {path2corpus_bow_preprocessed_diagnosis} --k_value {10}",
f"python TextClustering/basedOn_BOW/GSDPMM_Diagnosis.py --path2corpus {path2corpus_bow_preprocessed_diagnosis} --k_value {14}",
f"python TextClustering/basedOn_Embedding/BERT_Diagnosis.py --path2corpus {path2corpus_embedding_preprocessed_diagnosis} --do_embedding --model2use German_BERT --k_value {17}",
f"python TextClustering/basedOn_Embedding/BERT_Diagnosis.py --path2corpus {path2corpus_embedding_preprocessed_diagnosis} --do_embedding --model2use Patho_BERT --k_value {8}",
f"python TextClustering/basedOn_Embedding/top2vec_Diagnosis.py --path2corpus {path2corpus_embedding_preprocessed_diagnosis} --model2use doc2vec --k_value {7}",
"python TextClustering/cluster_scores2latextable.py",
"python TextClustering/plot_clustersets.py",
"python TextClustering/generate_topicwords.py",
"python TextClustering/clusterset_histos.py"
]
for script in script_queue:
print("\n########################################### executing ###########################################")
print(script)
print("####################################################################################################\n")
os.system(script)
import database_preparation.utils_labeled_datasets as dt
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sys, os
import argparse
sys.path.append(os.getcwd())
parser = argparse.ArgumentParser()
parser.add_argument("--df_cases_file", default="database/df_cases.pkl")
args = parser.parse_args()
plot_author_histos = False
cluster = 2
clustersets = ["HDBSCAN", "KMeans", "LDA", "GSDPMM",
"top2vec", "Patho_BERT", "German_BERT"]
df = pd.read_pickle(args.df_cases_file)
authors_labels = df["label_author"]
# plot histograms: how many docs have the same label (cluster index)?
for i,label_set in enumerate(clustersets):
try:
cluster_labels = dt.label_list_as_int_list(df['label_' + label_set])
except:
print(f"skipping {label_set}. it is not in the df_cases_file.")
continue
if plot_author_histos:
authors_of_cluster = [authors_labels[i] for i, label in enumerate(cluster_labels) if
label == cluster]
authors = np.asarray(authors_of_cluster)
x = [-1,0,1,2,3]
h = []
for l in x:
h.append(sum([1 for a in authors if a == l]))
plt.bar(x, height=h)
plt.title(label_set + " authors in cluster " + str(cluster))
file_path = 'TextClustering/plots/histograms/histogram_' + label_set + "_cluster" + str(cluster) + "_authors.png"
else:
labels = np.asarray([l for l in cluster_labels if l != -1])
label_num = dt.get_amount_unique_labels(label_set)
x = np.arange(label_num)
h = []
for l in x:
h.append(sum([1 for label in labels if label == l]))
plt.bar(x, height=h)
plt.xticks(x, x)
plt.title(label_set)
file_path = 'TextClustering/plots/histograms/histogram_' + label_set + ".png"
plt.xticks(x, x)
plt.savefig(file_path, dpi=600)
plt.close()
plt.clf()
print(f"generated {file_path}")
from TextClustering.utils_wordlist import generate_save_topicwords
import pandas as pd
from database_preparation.utils_labeled_datasets import label_list_as_int_list
from database_preparation.preprocess import get_metadata
import pickle
import openpyxl
# parameters:
df_cases_file = "database/df_cases.pkl"
print_latex = False
filter_stop_words = True
path2umap_pics = 'TextClustering/plots/UMAP/'
save_umap_picture_in_table = True
path2corpus_bow_preprocessed = 'database/bow_prepro_diag.pkl'
path2corpus_embedding_preprocessed = 'database/embedding_prepro_diag.pkl'
####### functions ##########
def main():
cluster_sets = ['KMeans', 'LDA', 'HDBSCAN', 'German_BERT', 'Patho_BERT', 'top2vec', 'GSDPMM']
# cluster_sets = ['German_BERT']
df_cases = pd.read_pickle(df_cases_file)
for cluster_set in cluster_sets:
# re-generate the topic words:
excel_file_path = 'TextClustering/tables/WordsPerCluster_' + cluster_set + '.xlsx'
# convert nan-values in int(-1):
try:
clusters = label_list_as_int_list(df_cases['label_' + cluster_set])
except:
print(f"skipping {cluster_set}. it is not in the df_cases_file.")
continue
if cluster_set in ['German_BERT', 'Patho_BERT', 'top2vec']:
text_corpus_path = path2corpus_embedding_preprocessed
else:
text_corpus_path = path2corpus_bow_preprocessed
meta_params = get_metadata(text_corpus_path)
with open(text_corpus_path, 'rb') as f:
diag_lst = pickle.load(f)
# do not apply stopword filtering if the corpus is already stopword-filtered!
generate_save_topicwords(clusters, diag_lst, save_excel_file_path=excel_file_path,
n_words=10, print_latex_table=print_latex,
filter_stop_words=filter_stop_words and not meta_params['stopword_filtered'])
if save_umap_picture_in_table:
pic_path = path2umap_pics + cluster_set + "_UMAP.png"
try:
img = openpyxl.drawing.image.Image(pic_path)
wb = openpyxl.load_workbook(excel_file_path)
ws = wb.create_sheet("umap")
img.anchor = 'A1'
img.width = img.width / 2
img.height = img.height / 2
ws.add_image(img)
wb.save(excel_file_path)
print(f"Generated {excel_file_path}")
except:
print("could not load " + pic_path)
print("therefore, cant place umap picture into " + excel_file_path)
######### topic words of authors #########
clusters = label_list_as_int_list(df_cases['label_author'])
excel_file_path = 'TextClustering/tables/WordsPerCluster_authors.xlsx'
with open(path2corpus_bow_preprocessed, 'rb') as f:
diag_lst = pickle.load(f)
generate_save_topicwords(clusters, diag_lst, save_excel_file_path=excel_file_path,
n_words=20, print_latex_table=print_latex,
filter_stop_words=False)
if __name__ == '__main__':
main()
import pandas as pd
from TextClustering.utils_metrics import cluster_scatter_plot
import numpy as np
from database_preparation.utils_labeled_datasets import label_list_as_int_list
clustersets = ["GSDPMM", "KMeans", "LDA", "HDBSCAN",
"top2vec", "Patho_BERT", "German_BERT"]
plot_titles = ["GSDPMM (UMAP representation)", "k-means (UMAP representation)",
"LDA (UMAP representation)", "HDBSCAN (UMAP representation)",
"top2vec (UMAP representation)", "Patho-BERT (UMAP representation)",
"German-BERT (UMAP representation)"]
df_cases_file = "database/df_cases.pkl"
def save_umap_plot(clustersetname, df, title=None):
if not 'label_' + clustersetname in df:
print("skipping " + clustersetname + ", it is not in df_cases_file:")
print(df)
return
predictedCluster_text_features = label_list_as_int_list(df['label_' + clustersetname])
try:
umap_text_features2D = np.asarray([[e for e in df['umapX_' + clustersetname]],
[e for e in df['umapY_' + clustersetname]]])
except:
print("there is no umapX_" + clustersetname + " in database/df_cases.pkl. => skipping")
return
umap_text_features2D = np.transpose(umap_text_features2D)
cluster_scatter_plot(umap_text_features2D, predictedCluster_text_features,
"TextClustering/plots/UMAP/" + clustersetname + "_UMAP.png",
show_plot=False, colorblindfriendly=False, fig_title=title)
if 'label_author' in df:
author_labels = df["label_author"]
cluster_scatter_plot(umap_text_features2D, author_labels,
"TextClustering/plots/UMAP/" + clustersetname + "_UMAP_authors.png",
show_plot=False, colorblindfriendly=True, number_data_points=False
, fig_title=title + ", colored by authors")
if 'label_golden' in df:
golden_labels = df["label_golden"]
cluster_scatter_plot(umap_text_features2D, golden_labels,
"TextClustering/plots/UMAP/" + clustersetname + "_UMAP_goldenlabel.png",
show_plot=False, colorblindfriendly=True
, fig_title=title + " colored with golden labels")
def main():
df = pd.read_pickle(df_cases_file)
for clustersetname, title in zip(clustersets, plot_titles):
    save_umap_plot(clustersetname, df, title)
# plot author-colored and cluster-colored lda clustersets as pca representation:
if 'label_LDA' in df and 'pcaX_LDA' in df:
predictedCluster_text_features = df['label_LDA']
features2D = np.asarray([[e for e in df['pcaX_LDA']],
[e for e in df['pcaY_LDA']]])
features2D = np.transpose(features2D)
cluster_scatter_plot(features2D, predictedCluster_text_features,
"TextClustering/plots/PCA/LDA_PCA.png",
show_plot=False, colorblindfriendly=False,
fig_title="LDA (PCA representation)")
cluster_scatter_plot(features2D, df["label_author"],
"TextClustering/plots/PCA/LDA_PCA_authors.png",
show_plot=False, colorblindfriendly=True,
number_data_points=False, fig_title='LDA (PCA representation), colored by authors')
if __name__ == '__main__':
main()
import openpyxl
from TextClustering.utils_wordlist import get_top_cluster_words_as_latex_table
from googletrans import Translator # use pip install googletrans==3.1.0a0, 3.0 version is broken
from utils_general import custom_translation
path2table = "WordsPerCluster_HDBSCAN.xlsx"
green = 'FF00FF00'
blue = 'FF4A86E8'
orange = 'FFFF9900'
black = '1'
latex_weak_word = '\\weakcolor'
latex_strong_word = '\\strongcolor'
def color2latex_color(color):
if color == green:
return latex_strong_word
if color == blue:
return latex_weak_word
if color == orange:
return latex_weak_word
# print(f"unknown color: {color}")
return None
def get_annotated_exceltable(ws):
words_list = []
topics = []
colors = []
for idx, col in enumerate(ws.iter_rows(min_row=2, max_row=25, min_col=1, max_col=11)):
if col[0].value is None:
break
words_list.append([])
colors.append([])
for i, cell in enumerate(col):
if i == 0:
topics.append((cell.value, color2latex_color(cell.font.color.rgb)))
else:
words_list[idx].append(cell.value)
colors[idx].append(color2latex_color(cell.font.color.rgb))
# return get_top_cluster_words_as_latex_table(words_list, colors, topics)
return words_list, colors, topics
def main():
wb = openpyxl.load_workbook(path2table)
extraction_methods = ['tf-idf', 'SVM']
cluster_method = 'HDBSCAN'
annotate_svm_as_tfidf = True
print_also_translated_tables = True
translator = Translator()
words_list_tfidf = []
colorstfidf = []
topicstfidf = []
for i, extraction_method in enumerate(extraction_methods):
ws = wb[['TFIDF-based', 'svm-based'][i]]
words_list, colors, topics = get_annotated_exceltable(ws)
if annotate_svm_as_tfidf:
if extraction_method != 'tf-idf':
topics = topicstfidf
for j, words in enumerate(words_list):
for k, word in enumerate(words):
if word in words_list_tfidf[j]:
colors[j][k] = colorstfidf[j][words_list_tfidf[j].index(word)]
else:
words_list_tfidf, colorstfidf, topicstfidf = words_list, colors, topics
# print german topic words:
label = 'table_cluster_topics_' + cluster_method + '_' + extraction_method + '_ger'
# print("\n processing " + label+"...\n")
description = f'Annotated German topic words, extracted from the {cluster_method} cluster-set, ' \
f'using the {extraction_method} based extraction method.'
latex = get_top_cluster_words_as_latex_table(words_list, colors, topics).replace('DESCRIPTON',
description).replace(
'EXTRACTIONMETHOD', extraction_method).replace(
'LABEL', label
)
print(latex)
# print english topic words:
if print_also_translated_tables:
description = f'Annotated topic words (translated from German to English), ' \
f'extracted from the {cluster_method} cluster-set, ' \
f'using the {extraction_method} based extraction method.'
label = 'table_cluster_topics_' + cluster_method + '_' + extraction_method + '_eng'
word_list_eng = [[word if word.lower() not in custom_translation.keys() else custom_translation[word.lower()]
for word in words] for words in words_list]
topics_eng = [(translator.translate(topic[0], src='de').text, topic[1]) for topic in topics]
latex = get_top_cluster_words_as_latex_table(word_list_eng, colors, topics_eng).replace('DESCRIPTON',
description).replace(
'EXTRACTIONMETHOD', extraction_method).replace(
'LABEL', label
)
print(latex)
if __name__ == '__main__':
main()
#%% import
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from validclust import cop, dunn
from sklearn.metrics import pairwise_distances
import pandas as pd
import os
import numpy as np
#%% class definition
class ClusterMetrics:
def __init__(self, feature_matrix, feature_label, file_name = "cluster_metrics.pkl"):
self.file_name = file_name
self.feature_matrix = feature_matrix
if (type(feature_matrix).__name__) == "csr_matrix":
self.__feature_matrix_array__ = feature_matrix.toarray()
elif (type(feature_matrix).__name__) == "list":
self.__feature_matrix_array__ = np.array(feature_matrix)
else:
self.__feature_matrix_array__ = feature_matrix
self.feature_label = feature_label
self.__feature_label_array = np.array(feature_label)
self.__distance_matrix__ = pairwise_distances(self.__feature_matrix_array__)
# calculate the Silhouette Coefficient (values -1 to 1)
self.s_score = silhouette_score(feature_matrix, feature_label)
# calculate the Calinski-Harabasz Index (the higher the value, the better)
self.ch_index = calinski_harabasz_score(self.__feature_matrix_array__ , feature_label)
# calculate the Davies-Bouldin Index (the lower the value, the better)
self.db_score = davies_bouldin_score(self.__feature_matrix_array__, feature_label)
# calculate COP CVI
self.cop = cop(self.__feature_matrix_array__, self.__distance_matrix__, self.__feature_label_array)
# calculate Dunn CVI
self.dunn = dunn(self.__distance_matrix__, self.__feature_label_array)
# place for entropy
self.entropy = None
self.svm_accuracy = None
def write_to_file(self):
results = [np.round(self.s_score,3),
np.round(self.ch_index,3),
np.round(self.db_score,3),
np.round(self.cop,3),
np.round(self.dunn,3),
self.entropy,
self.svm_accuracy]
head, tail = os.path.split(self.file_name)
tail = tail[:-4]
df = pd.DataFrame(results,
index =['s-score', 'ch-index', 'db-score', 'cop', 'dunn-score', 'entropy', 'svm-accuracy'],
columns =[tail])
df.to_pickle(self.file_name)
print(df)
def __str__(self):
return "s-score: " + str(np.round(self.s_score,2)) + "[-1:1]" + "\n" + \
"ch-index: " + str(np.round(self.ch_index,2)) + " [0:]" "\n" + \
"db-score: " + str(np.round(self.db_score, 2)) + "[0:]" "\n" + \
"cop: " + str(np.round(self.cop, 2)) + " []" "\n" + \
"dunn: " + str(np.round(self.dunn, 2)) + " []" "\n" + \
"entropy: " + str(self.entropy) + " []" "\n" + \
"svm-accuracy: " + str(self.svm_accuracy) + " []"
#%% define plot functions for PCA
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
def plot_pca(text_features, labels, file_path = [], show_plot = True):
pca = PCA(n_components=2)
reduced_features = pca.fit_transform(text_features)
plt.close()
plt.scatter(x=reduced_features[:, 0], y=reduced_features[:, 1],
c=np.int8(labels), cmap="tab20")
plt.colorbar()
plt.title('PCA-representation')
if bool(file_path):
plt.savefig(file_path)
if show_plot:
plt.show()
#%% define plot function for T-SNE
from sklearn.manifold import TSNE
def plot_tsne(text_features, labels, file_path = [], show_plot = True):
tsne = TSNE(n_components=2, verbose=1, random_state=123)
reduced_features = tsne.fit_transform(text_features)
plt.close()
plt.scatter(x=reduced_features[:, 0], y=reduced_features[:, 1],
c=np.int8(labels), cmap="tab20")
plt.colorbar()
plt.title('T-SNE-representation')
if bool(file_path):
plt.savefig(file_path)
if show_plot:
plt.show()
#%% define plot function for UMAP
import umap
import seaborn as sns
def cluster_scatter_plot(umap_text_features2D, labels, file_path = [],
show_plot=True, colorblindfriendly=True,
number_data_points=True, fig_title=None):
'''umap_text_features2D = umap.UMAP(n_neighbors=15,
n_components=2,
min_dist=0.0, metric='cosine').fit_transform(text_features)'''
plt.close()
# plot invalid / unlabeled data points (label == -1) in black:
x = [e for i, e in enumerate(umap_text_features2D[:, 0]) if labels[i] == -1]
y = [e for i, e in enumerate(umap_text_features2D[:, 1]) if labels[i] == -1]
if len(x) > 0:
plt.scatter(x=x, y=y, c='black', marker='.')
# plot valid labeled datapoints:
x_val = [e for i,e in enumerate(umap_text_features2D[:, 0]) if labels[i] != -1]
y_val = [e for i,e in enumerate(umap_text_features2D[:, 1]) if labels[i] != -1]
valid_labels = [l for l in labels if l != -1]
if colorblindfriendly:  # use the colorblind palette (it has 10 colors)
style = []
for label in valid_labels:
if label <= 9: #0-9
style.append(0)
elif label >= 20: #20-inf
style.append(1)
else: #10-19
style.append(2)
sns.scatterplot(x=x_val, y=y_val, hue=valid_labels,
palette="colorblind", style=style,
legend=True, linewidth=.3)
if number_data_points:
nummerate_clusters_in_plot(x_val, y_val, valid_labels)
else:  # use tab20, it has 20 different colors
x = [e for i, e in enumerate(x_val) if valid_labels[i] <= 19]
y = [e for i, e in enumerate(y_val) if valid_labels[i] <= 19]
c = [e for e in valid_labels if e <= 19]
plt.scatter(x=x,
y=y,
c=np.int8(c),
cmap="tab20", edgecolors='white', linewidth=.3
, marker='o')
if number_data_points:
nummerate_clusters_in_plot(x,y,c)
'''plt.legend(handles=scatter.legend_elements()[0],
labels=[str(l) for l in c], loc="best")'''
plt.colorbar(values=[int(e) for e in np.unique(np.asarray(c))])
c = [e for e in valid_labels if e > 19]
if len(c)>0:
x = [e for i, e in enumerate(x_val) if valid_labels[i] > 19]
y = [e for i, e in enumerate(y_val) if valid_labels[i] > 19]
plt.scatter(x=x, y=y,
c=np.int8(c),
cmap="tab20", edgecolors='white', linewidth=.3
, marker='P')
if number_data_points:
nummerate_clusters_in_plot(x, y, c)
if fig_title is None:
if bool(file_path):
import os
fig_title = os.path.basename(file_path)
else:
fig_title = "UMAP"
plt.title(fig_title.replace(".png",""))
if bool(file_path):
print("generated "+file_path)
plt.savefig(file_path,dpi=300)
if show_plot:
plt.show()
def nummerate_clusters_in_plot(x,y,labels):
annotated_labels = []
for i, label in enumerate(labels):
if label not in annotated_labels:
plt.annotate(label, (x[i], y[i]))
annotated_labels.append(label)
def Jaccard_Similarity(doc1, doc2):
if isinstance(doc1, list):
doc1 = " ".join(doc1)
doc2 = " ".join(doc2)
# List the unique words in a document
words_doc1 = set(doc1.lower().split())
words_doc2 = set(doc2.lower().split())
#print(words_doc1)
#print(words_doc2)
# Find the intersection of words list of doc1 & doc2
intersection = words_doc1.intersection(words_doc2)
# Find the union of words list of doc1 & doc2
union = words_doc1.union(words_doc2)
# Calculate Jaccard similarity score
# using length of intersection set divided by length of union set
return float(len(intersection)) / len(union)
#%%
import numpy as np
def get_distance_matrix(str_list):
dist_matrix = np.zeros(shape=(len(str_list), len(str_list)))
# calculate the upper triangle (pairwise Jaccard similarities)
for i in range(0, len(str_list)):
for j in range(i+1, len(str_list)):
dist_matrix[i][j] = Jaccard_Similarity(str_list[i], str_list[j])
# mirror it into the lower triangle
for i in range(0, len(str_list)):
for j in range(0, len(str_list)):
if i == j:
dist_matrix[i][j] = 0
elif i > j:
dist_matrix[i][j] = dist_matrix[j][i]
return dist_matrix
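# Minimal self-check sketch for the Jaccard helpers above (toy strings, illustrative only):
if __name__ == '__main__':
    # "der" and "fall" are shared, the union has 4 unique words -> 2/4 = 0.5
    print(Jaccard_Similarity("der fall zeigt", "der fall ist"))
    # symmetric matrix with zero diagonal, filled with pairwise Jaccard similarities
    print(get_distance_matrix(["der fall zeigt", "der fall ist", "ganz anderer text"]))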
# -*- coding: iso-8859-1 -*-
from database_preparation.utils_stringpreparation import get_most_frequent_words
import numpy as np
import pandas
import yake
import nltk
import openpyxl as pxl
from database_preparation.utils_labeled_datasets import text_label_2_labeled_dataset
import pandas as pd
from database_preparation.utils_labeled_datasets import label_list_as_int_list
from database_preparation.stop_word_list import filter_stopwords
def get_nwordlist(text_lst, cluster_lst, n_words=10,
excel_file_path=[],
method="frequency-based",
filter_stop_words=True):
"""
- if excel_file_path is given, the word list will be saved as .xlsx with sheet name = method
"""
ignore_outlier = True
# %% prepare parameter
docs = {'text': text_lst, 'cluster': cluster_lst}
n_cluster = np.unique(cluster_lst)
frame = pandas.DataFrame(docs, index=[cluster_lst])
text1 = np.asarray(text_lst[0])
text_lst_is_tokenized = bool(text1.ndim)
if method in ['svm-based', 'LR-based']:
###### prepare text data: ######
if text_lst_is_tokenized:
if filter_stop_words:
dataset = text_label_2_labeled_dataset(
[filter_stopwords(text) for text in text_lst], cluster_lst
)
else:
dataset = text_label_2_labeled_dataset(text_lst, cluster_lst)
else:
# print("passed text list is not tokenized. Tokenizing it now with nltk...")
# tokenize
tokenized_texts = []
for t_text in text_lst:
if filter_stop_words:
tokenized_texts.append(filter_stopwords(nltk.tokenize.word_tokenize(t_text, language='german')))
else:
tokenized_texts.append(nltk.tokenize.word_tokenize(t_text, language='german'))
dataset = text_label_2_labeled_dataset(tokenized_texts, cluster_lst)
###### train svm: ######
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegressionCV
def identity(words):
return words
def get_trained_svm(texts, labels):
vec = TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)
# svd = TruncatedSVD(n_components=1000, n_iter=7, random_state=42)
# lsa = make_pipeline(vec, svd)
# clf = SVC(probability=True, kernel="linear")
clf = LinearSVC()
# pipe = make_pipeline(lsa, clf)
pipe = make_pipeline(vec, clf)
pipe.fit(texts, labels)
return pipe, vec, clf
def get_trained_LR(text, labels):
# vec = CountVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)
vec = TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)
clf = LogisticRegressionCV()
pipe = make_pipeline(vec, clf)
pipe.fit(text, labels)
return pipe, vec, clf
if method == 'svm-based':
pipe, vec, clf = get_trained_svm(dataset['text'], dataset['label'])
else:
pipe, vec, clf = get_trained_LR(dataset['text'], dataset['label'])
def get_correct_predictions(texts, labels, pipe):
y_preds = pipe.predict(texts)
correct_predictions = []
for i, y_pred in enumerate(y_preds):
if y_pred == labels[i]:
correct_predictions.append(i)
return correct_predictions
correct_predictions = get_correct_predictions(dataset['text'], dataset['label'], pipe)
'''print("found " + str(len(correct_predictions)) + '/' +
str(len(dataset['text'])) + " correct predicted docs.")'''
del clf, vec, pipe
if method == 'svm-based':
pipe, vec, clf = get_trained_svm(dataset[correct_predictions]['text'],
dataset[correct_predictions]['label'])
else:
pipe, vec, clf = get_trained_LR(dataset[correct_predictions]['text'],
dataset[correct_predictions]['label'])
'''print("using "+method+" which predicted " +
str(len(
get_correct_predictions(dataset[correct_predictions]['text'], dataset[correct_predictions]['label'], pipe)))
+ '/' + str(len(dataset[correct_predictions]['text'])) + " documents correctly.")'''
########## weight analysis ##############
# get feature importance:
feature_names = vec.get_feature_names_out()
top_word_lists = []
for i, coef in enumerate(clf.coef_):
if clf.__class__.__name__ == 'SVC':
coef = coef.toarray()
weights = list(zip(feature_names, [coef[0, i] for i in range(coef.shape[1])]))
else:
weights = list(zip(vec.get_feature_names_out(), coef))
most_positives = sorted(weights, key=lambda x: -x[1])[:n_words]
most_negatives = sorted(weights, key=lambda x: x[1])[:n_words]
top_word_lists.append([tup[0] for tup in most_positives])
'''f_mnames = vec.get_feature_names_out()
coefs = clf.coef_.ravel()
top_positive_coefficients = np.argsort(coefs)[-n_words:]
top_negative_coefficients = np.argsort(coefs)[:n_words]
top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])
print(f_mnames[top_coefficients])'''
save_topwordlist_as_excel(excel_file_path, top_word_lists, method)
return top_word_lists
# %% prepare different methods
if method == "TFIDF-based":
cluster_words = []
clusters = list(np.unique(cluster_lst))
if ignore_outlier: # if yes: remove cluster -1
if -1 in clusters:
clusters.remove(-1)
if text_lst_is_tokenized:
for i_cluster in clusters:
t_frame = frame[frame['cluster'] == i_cluster]
if filter_stop_words:
t_text = []
for text in t_frame['text'].to_list():
filtered = filter_stopwords(text)
t_text.append(filtered)
else:
t_text = t_frame['text'].to_list()
words_of_cluster = [word for text in t_text for word in text]
cluster_words.append(words_of_cluster)
else:
for i_cluster in clusters:
t_frame = frame[frame['cluster'] == i_cluster]
t_text = []
for text in t_frame['text'].to_list():
tokenized = nltk.tokenize.word_tokenize(text, language='german')
if filter_stop_words:
t_text.append(filter_stopwords(tokenized))
else:
t_text.append(tokenized)
words_of_cluster = [word for text in t_text for word in text]
cluster_words.append(words_of_cluster)
if method == "yake":
def flatten(lst):
text = ''
for t_text in lst:
text += str(t_text)
return text
kw_extractor = yake.KeywordExtractor(lan='German',
n=1,
dedupLim=0.9,
dedupFunc='seqm',
windowsSize=1,
top=n_words, features=None)
# %% perform it for TFIDF
if method == "TFIDF-based":
# %% prepare the corpus for tfidf
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
corpus = [str(i) for i in cluster_words]
vectors = vectorizer.fit_transform(corpus)
names = vectorizer.get_feature_names_out()
data = vectors.todense().tolist()
df = pandas.DataFrame(data, columns=names)
# %% create the words list
word_list = []
for i in df.iterrows():
i_words = i[1].sort_values(ascending=False)[:n_words]
word_list.append(i_words.index.to_list())
else:
# %% perform it for the other approaches
clusters = []
word_list = []
for i_cluster in n_cluster:
# ignore outlier
if ignore_outlier:
if i_cluster < 0:
continue
t_frame = frame[frame['cluster'] == i_cluster]
if method == "frequency-based":
if text_lst_is_tokenized:
if filter_stop_words:
t_text = [filter_stopwords(text) for text in t_frame['text'].to_list()]
else:
t_text = t_frame['text'].to_list()
else:
t_text = []
for text in t_frame['text'].to_list():
tokenized = nltk.tokenize.word_tokenize(text, language='german')
if filter_stop_words:
t_text.append(filter_stopwords(tokenized))
else:
t_text.append(tokenized)
top_words = get_most_frequent_words(str(t_text), n_words)
elif method == "yake":
if text_lst_is_tokenized:
# join word-lists into single string
if filter_stop_words:
cluster_text_list = [filter_stopwords(text) for text in t_frame['text'].to_list()]
else:
cluster_text_list = t_frame['text'].to_list()
t_text = [' '.join(text) for text in cluster_text_list]
else:
if filter_stop_words: # tokenize->filterstopwords->join
t_text_tokenized = []
for text in t_frame['text'].to_list(): # tokenize+filtering
tokenized = nltk.tokenize.word_tokenize(text, language='german')
t_text_tokenized.append(filter_stopwords(tokenized))
t_text = [' '.join(text) for text in t_text_tokenized] # join
else:
t_text = t_frame['text'].to_list()
keywords = kw_extractor.extract_keywords(flatten(t_text))
top_words = [i_key[0] for i_key in keywords]
clusters.append(i_cluster)
word_list.append(top_words)
# %% how to get n_words in an array
for i in range(0, len(word_list)):
t_token = np.array(word_list[i])
if len(t_token) < n_words + 1:
t_token = np.append(t_token, np.repeat(np.nan, n_words - len(t_token)))
t_token = t_token.reshape((1, -1))
if i == 0:
token_list = t_token
else:
token_list = np.concatenate((token_list, t_token), axis=0)
# %% save it to the excel (bad style the way to the dark side is...)
save_topwordlist_as_excel(excel_file_path, token_list, method)
# %% return the results
return word_list
def save_topwordlist_as_excel(file_path, token_list, sheet_name):
if bool(file_path):
try: # if excelfile does exist, append new sheet to workbook:
excel_book = pxl.load_workbook(file_path)
if sheet_name in excel_book.sheetnames:
    excel_book.remove(excel_book[sheet_name])
with pandas.ExcelWriter(file_path, engine='openpyxl', if_sheet_exists=None) as writer:
writer.book = excel_book
writer.sheets = {
worksheet.title: worksheet
for worksheet in excel_book.worksheets
}
pandas.DataFrame(token_list).to_excel(writer, sheet_name)
writer.save()
except: # otherwise: create new workbook and save
pandas.DataFrame(token_list).to_excel(file_path, sheet_name=sheet_name)
def generate_save_topicwords(predictedClusters, text_lst, save_excel_file_path,
n_words=10, print_latex_table=False,
extraction_methods=['TFIDF-based', 'frequency-based', 'yake', 'svm-based'],
filter_stop_words=True):
'''
creates the n_words most relevant topic words per cluster (tf-idf, tf, yake and svm based)
and saves them as .xlsx and as a latex table
'''
latextable = ''
for method in extraction_methods:
word_list = get_nwordlist(text_lst, predictedClusters,
n_words=n_words,
excel_file_path=save_excel_file_path,
method=method,
filter_stop_words=filter_stop_words)
latextable = latextable + "\n%================== " + save_excel_file_path + " " + method + " ================="
latextable = latextable + get_top_cluster_words_as_latex_table(word_list)
latextable = latextable + "%========================================================\n"
if print_latex_table:
print(latextable)
with open(save_excel_file_path.replace('.xlsx', '') + '_latex.txt', 'w') as f:
f.write(latextable)
def get_top_cluster_words_as_latex_table(words_lists, colors=None, cluster_topics=None):
first_part = '''
%%%%%%%%%%%%%%%%%%%%%% LABEL %%%%%%%%%%%%%%%%%%%%%%%%%%
\\begin{table}[!htb]
\caption{DESCRIPTON}\label{LABEL}
\\noindent
\\begin{tabularx}{\linewidth}{|@{}>{}l|@{\hspace{.5em}}X@{}|}
\hline
\\textbf{ cluster index - cluster name } & \\textbf{keywords according to EXTRACTIONMETHOD} \\\\ \\hline
'''
end_part = '''
\end{tabularx}
\end{table}
'''
if colors is None or cluster_topics is None:
latex_code_m = ""
for i, words in enumerate(words_lists):
latex_code_m = latex_code_m + str(i) + " & " + ", ".join(words) + " \\\\ \hline \n"
else:
latex_code_m = ""
colored_tops = [int(float(top[0])) if top[1] is None else '\\colorbox{' + top[1] + '}{' + top[0] + '}' for top in
cluster_topics]
for i, words in enumerate(words_lists):
colored_words = [word if colors[i][j] is None else '\\colorbox{'+colors[i][j]+'}{'+word+'}' for j,word in enumerate(words)]
latex_code_m = latex_code_m + str(colored_tops[i]) + " & " + ", ".join(colored_words) + " \\\\ \hline"
if i+1 < len(words_lists):
latex_code_m = latex_code_m + '\n'
latex_code = first_part + latex_code_m + end_part
latex_code = latex_code.replace("_", "\_")
# print("================== cluster topics " + cluster_method + " =================")
# print(latex_code)
# print("========================================================")
return latex_code
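# Illustrative (hypothetical) call without colour annotations: each inner list holds one
# cluster's top words and becomes one table row, e.g. "0 & niere, glomerulum \\ \hline":
# print(get_top_cluster_words_as_latex_table([["niere", "glomerulum"], ["tubulus", "interstitium"]]))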
def print_excel_topciwordlist_as_latex(cluster_set, excel_file_path=None, topic_word_method="TFIDF-based"):
if excel_file_path == None:
excel_file_path = 'TextClustering/tables/WordsPerCluster_' + cluster_set + '_temp.xlsx'
df_cases = pd.read_pickle("database/df_cases.pkl")
# convert nan-values in int(-1):
clusters = label_list_as_int_list(df_cases['label_' + cluster_set])
df = pd.read_excel(open(excel_file_path, 'rb'), sheet_name=topic_word_method).T
df.drop(['Unnamed: 0'], inplace=True)
rename_dic = {}
for i in range(np.max(clusters) + 1):
rename_dic[i] = ' ' + str(i)
df.rename(columns=rename_dic, inplace=True)
latex_code = '''
\\begin{table}[h]
\caption{DESCRIPTON}\label{table_cluster_topics}
\\noindent
\\begin{tabularx}{\linewidth}{|@{}>{\\bfseries}l|@{\hspace{.5em}}X@{}|}
\hline
'''
latex_code_m = ""
for k in rename_dic.values():
latex_code_m = latex_code_m + "\\textbf{" + str(k) + "} & " + ", ".join(df[k]) + " \\\\ \hline \n"
latex_code = latex_code + latex_code_m + '''
\end{tabularx}
\end{table}
'''
latex_code = latex_code.replace("_", "\_")
print("%================== cluster topics of cluster-set" + cluster_set + " =================")
print(latex_code)
print("%========================================================")
def main():
pass
if __name__ == '__main__':
main()
{"source_data": "../DataNephroTexts/description", "tokenized": true, "cased": false, "stopword_filtered": true, "use_combiner": true, "use_replacer": true, "lemma_mode": 3, "punct_mode": 2, "number_mode": 3}
\ No newline at end of file
{"source_data": "../DataNephroTexts/diagnosis", "tokenized": true, "cased": false, "stopword_filtered": true, "use_combiner": true, "use_replacer": true, "lemma_mode": 3, "punct_mode": 2, "number_mode": 3}
\ No newline at end of file
{"source_data": "../DataNephroTexts/description", "tokenized": false, "cased": true, "stopword_filtered": false, "use_combiner": true, "use_replacer": true, "lemma_mode": 4, "punct_mode": 1, "number_mode": 3}
\ No newline at end of file
{"source_data": "../DataNephroTexts/diagnosis", "tokenized": false, "cased": true, "stopword_filtered": false, "use_combiner": true, "use_replacer": true, "lemma_mode": 4, "punct_mode": 1, "number_mode": 3}
\ No newline at end of file
from transformers import AutoModelForMaskedLM, AutoTokenizer
import pickle
# script parameters:
modelname = "bert-base-german-cased"
path2corpus_embedding_preprocessed_diagnosis = 'database/embedding_prepro_diag.pkl'
path2corpus_embedding_preprocessed_description = 'database/embedding_prepro_desc.pkl'
tokenizer = AutoTokenizer.from_pretrained(modelname)
model = AutoModelForMaskedLM.from_pretrained(modelname)
unknown_id = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)
with open(path2corpus_embedding_preprocessed_description, 'rb') as f:
micro_texts = pickle.load(f)
with open(path2corpus_embedding_preprocessed_diagnosis, 'rb') as f:
diag_texts = pickle.load(f)
def find_oov_cases(texts):
oov_cases = 0
for text_num, text in enumerate(texts):
if unknown_id in tokenizer.encode(text):
tokens = text.split(" ")
for i, token in enumerate(tokens):
if unknown_id in tokenizer.encode(token):
oov_cases += 1
print("found OOV case in text " + str(text_num))
print("the word \'" + str(token) + "\' in " + str(tokens[i - 2:i + 2]) + " is OOV")
return oov_cases
oov_sum = find_oov_cases(micro_texts) + find_oov_cases(diag_texts)
print("\nFinished. Found " + str(oov_sum) + " OOV cases (see above).")
\ No newline at end of file
# -*- coding: iso-8859-1 -*-
import os
# params:
path_to_reports = '../DataNephroTexts/reports'
author_names = "Name1 Name2 Name3 Name4" ## <- Type in the names of the pathologists of your institut!
splitted_reports_folder_path = '../DataNephroTexts'
path2corpus_bow_preprocessed_diagnosis = 'database/bow_prepro_diag.pkl'
path2corpus_embedding_preprocessed_diagnosis = 'database/embedding_prepro_diag.pkl'
path2corpus_bow_preprocessed_description = 'database/bow_prepro_desc.pkl'
path2corpus_embedding_preprocessed_description = 'database/embedding_prepro_desc.pkl'
# check if we are at correct working directory:
workdir = os.getcwd()
if not workdir[-len('nlp-in-diagnostic-texts-from-nephropathology'):] == 'nlp-in-diagnostic-texts-from-nephropathology':
print(workdir + " is the wrong working directory.")
print("please make shure to run this script with working directory '.../path/to/nlp-in-diagnostic-texts-from-nephropathology'.")
exit(1)
preparation_queue = [
"python database_preparation/split_reports.py --path_to_reports " + path_to_reports + " --target_folder_path " + splitted_reports_folder_path + " --author_names \"" + author_names + '\"',
f"python database_preparation/preprocess.py --path_to_preprocessing_params {path2corpus_bow_preprocessed_diagnosis.replace('.pkl','_meta.json')} --target_path {path2corpus_bow_preprocessed_diagnosis}",
f"python database_preparation/preprocess.py --path_to_preprocessing_params {path2corpus_embedding_preprocessed_diagnosis.replace('.pkl','_meta.json')} --target_path {path2corpus_embedding_preprocessed_diagnosis}",
f"python database_preparation/preprocess.py --path_to_preprocessing_params {path2corpus_bow_preprocessed_description.replace('.pkl','_meta.json')} --target_path {path2corpus_bow_preprocessed_description}",
f"python database_preparation/preprocess.py --path_to_preprocessing_params {path2corpus_embedding_preprocessed_description.replace('.pkl','_meta.json')} --target_path {path2corpus_embedding_preprocessed_description}",
]
for script in preparation_queue:
print("\n########################################### executing ###########################################")
print(script)
print("####################################################################################################\n")
os.system(script)
# -*- coding: iso-8859-1 -*-
import sys, os
import pandas as pd
from database_preparation.utils_stringpreparation import read_german_text
import argparse
def amount_names(text):
return len(text.split('Dr.'))-1
def cut_off_by_keywords(text, keywords=['Nachtragsbefund','Nachbericht']):
'''
cuts off (removes) the text part that begins with any of the passed keyword(s)
and returns the new (shortened) text.
'''
for keyword in keywords:
if keyword in text:
text = text[:text.index(keyword)]
return text
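# Illustrative example (made-up text): everything from the first matching keyword on is dropped:
# cut_off_by_keywords("Hauptbefund unauffaellig. Nachbericht: weitere Faerbungen folgen.")
# -> "Hauptbefund unauffaellig. "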
def get_names(text):
names = []
for parts in text.split('Dr. med.')[1:]:
tokens = parts.split(' ')
for token in tokens:
if '.' in token:
continue
if True in [c.isdigit() for c in token]:
continue
if 'Tel' in token:
continue
if token in '- war Befundverwendung für wissenschaftliche Zwecke oder Gutachten nur mit Genehmigung des Befunders OA PD':
continue
names.append(token)
return names
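# Illustrative example (fictitious name): tokens with digits, dots, 'Tel' or words from the
# boilerplate string above are skipped, so only the surname remains:
# get_names("Befund erstellt von Dr. med. Mustermann") -> ['Mustermann']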
def add_author_labels_to_df_cases(path_to_end_sections, authors, df_cases_file = "database/df_cases.pkl"):
df = pd.read_pickle(df_cases_file)
filenames = df["end_text_files"]
author_labels = []
print(f"\nLabeling df_cases file with authors. Searching for {authors} in {path_to_end_sections}")
for idx, filename in enumerate(filenames):
text = cut_off_by_keywords(read_german_text(path_to_end_sections + '/' + filename))
# detect authors in text
authors_in_text = [0 for a in range(len(authors))]
for j, author in enumerate(authors):
if author in text:
authors_in_text[j] = 1
# if only one author detected:
autor_combination_as_decimal = sum([pow(2, i) * n for i, n in enumerate(authors_in_text)])
if sum(authors_in_text) == 1:
label = authors_in_text.index(1)
else:
label = -1
author_labels.append(label)
df['label_author'] = author_labels
df.to_pickle(df_cases_file)
print("=> finished. Results:")
for i, author in enumerate(authors):
num = 0
for label in author_labels:
if label == i:
num += 1
print(author + " accured " + str(num) + " times")
sum_no_author = 0
for label in author_labels:
if label == -1:
sum_no_author = sum_no_author + 1
print(str(sum_no_author) + " unknown authors.")
return True
def main():
# parse arguments:
sys.path.append(os.getcwd())
parser = argparse.ArgumentParser()
parser.add_argument("--path_to_end_sections",
default='../DataNephroTexts/end')
parser.add_argument("--author_names",
default="Name1 Name2")
args = parser.parse_args()
authors = args.author_names.split(' ')
add_author_labels_to_df_cases(args.path_to_end_sections, authors)
if __name__ == '__main__':
main()
\ No newline at end of file
# -*- coding: iso-8859-1 -*-
import sys, os
import glob
from tqdm import tqdm
import pickle
import random
import nltk
from HanTa import HanoverTagger as ht
from enum import Enum
from database_preparation.utils_stringpreparation import read_german_text
from database_preparation.utils_wordbase import RegexpReplacer, RegexpSynonyms
from database_preparation.stop_word_list import filter_stopwords
import json
import argparse
########## define enums ##########
class LemmatizeMode(Enum):
lemma_only_nouns = 1
lemma_only_nouns_adja = 2
lemma = 3
none = 4
class PunctuationMode(Enum):
keep = 1
remove = 2
replace = 3
class NumberMode(Enum):
keep = 1
remove = 2
replace = 3
########## define some symbols and lists ##########
num_replace_symbol = "*"
punct_replace_symbol = "--"
punctuations_to_remove = ['%', '=', '+', '-', '?', '<', '>', '\'', '``', '\'\'',
',', ';', '.', '*', '#', '', '\\', '/', '(', ')', '[', ']',
'{', '}', '~', ':']
do_not_lemma_list = ['igg', 'iga', 'igm']
########## Functions ##########
def prepro_params_2_string(params):
metadata_text = ""
for i, key in enumerate(params.keys()):
metadata_text = metadata_text + key + ': '
if key == 'lemma_mode':
metadata_text = metadata_text + str(LemmatizeMode(params[key])).replace('LemmatizeMode.', '')
elif key == 'punct_mode':
metadata_text = metadata_text + str(PunctuationMode(params[key])).replace('PunctuationMode.', '')
elif key == 'number_mode':
metadata_text = metadata_text + str(NumberMode(params[key])).replace('NumberMode.', '')
else:
metadata_text = metadata_text + str(params[key])
if i < len(params.keys()) - 1:
metadata_text = metadata_text + '\n'
return metadata_text
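# Illustrative example (assumed minimal parameter dict):
# prepro_params_2_string({'tokenized': True, 'lemma_mode': 3})
# -> "tokenized: True\nlemma_mode: lemma"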
def get_metadata(path_to_pickled_prepro_text_list):
try:
with open(path_to_pickled_prepro_text_list.replace('.pkl','_meta.json')) as json_file:
params = json.load(json_file)
return params
except:
return None
def print_meta_data(path_to_pickled_prepro_text_list):
try:
params = get_metadata(path_to_pickled_prepro_text_list)
print(prepro_params_2_string(params))
print()
return True
except:
return False
def is_histo_num(word):
if word[:2].lower() == "h/" and word[2].isdigit():
return True
return False
def is_date(word):
if '.20' in word and word[0].isdigit() and word[-1].isdigit():
return True
return False
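# Illustrative examples (made-up values):
# is_histo_num("H/2021") -> True   (starts with 'H/' followed by a digit)
# is_date("14.05.2021")  -> True   (contains '.20', starts and ends with a digit)
# is_date("Niere")       -> False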
def get_corpus_stats(path2corpus):
corpus_stats = {'total_token_count': 0, 'amount_docs': 0, 'tokens_per_doc': 0, }
file_list = glob.glob(path2corpus + '/*.txt')
for idx, t_file in tqdm(enumerate(file_list)):
t_text = read_german_text(t_file)
t_text = nltk.tokenize.word_tokenize(t_text, language='german')
corpus_stats['total_token_count'] += len(t_text)
corpus_stats['tokens_per_doc'] = float(corpus_stats['total_token_count']) / float(len(file_list))
corpus_stats['amount_docs'] = len(file_list)
return corpus_stats
def preprocess(parameter_dict):
"""
preprocesses a corpus located at source_data_path=.../path_to_corpus_folder.
This folder (the corpus) should contain the .txt files to be processed.
The .txt files should be named in the form <name>#<number>.txt.
returns preprocessed_corpus as list of shape:
[first_preprocessed_text, second_preprocessed_text, ...]
Histo numbers and dates will always be removed!
"""
source_data_path = parameter_dict['source_data']
do_tokenize = parameter_dict['tokenized']
cased = parameter_dict['cased']
stopword_filtered = parameter_dict['stopword_filtered']
use_combiner = parameter_dict['use_combiner']
use_replacer = parameter_dict['use_replacer']
lemma_mode = parameter_dict['lemma_mode']
punct_mode = parameter_dict['punct_mode']
number_mode = parameter_dict['number_mode']
lemma_mode = LemmatizeMode(lemma_mode)
punct_mode = PunctuationMode(punct_mode)
number_mode = NumberMode(number_mode)
combiner = RegexpSynonyms()
replacer = RegexpReplacer()
tagger = ht.HanoverTagger('morphmodel_ger.pgz')
file_list = glob.glob(source_data_path + '/*.txt')
file_list = sorted(file_list, key=lambda f: int(f[f.find("#") + 1:-4]))
preprocessed_corpus = []
random_example_idx = random.randrange(min(10, len(file_list)))
for idx, t_file in tqdm(enumerate(file_list)):
# %% load the txt-file
t_text = read_german_text(t_file)
original_text = t_text
# replace the words
if use_replacer:
t_text = replacer.replace(t_text)
# tokenize
t_text = nltk.tokenize.word_tokenize(t_text, language='german')
# filter stopwords
if stopword_filtered:
t_text = filter_stopwords(t_text)
# combine word pairs
if use_combiner:
t_text = combiner.combine(t_text)
# lemmatize / stemming
t_text = tagger.tag_sent(t_text)
# %% lemmatize the text
if lemma_mode == LemmatizeMode.lemma_only_nouns:
t_text = [lemma for (word, lemma, pos) in t_text if pos == "NN" or pos == "NE"]
elif lemma_mode == LemmatizeMode.lemma_only_nouns_adja:
t_text = [lemma for (word, lemma, pos) in t_text if pos == "NN" or pos == "NE" or pos == "ADJA"]
elif lemma_mode == LemmatizeMode.lemma:
lemmatized_text = []
for (word, lemma, pos) in t_text:
if lemma == '--' or word.lower() in do_not_lemma_list:
lemmatized_text.append(word)
else:
lemmatized_text.append(lemma)
t_text = lemmatized_text
del lemmatized_text
else: # none
t_text = [word for (word, lemma, pos) in t_text]
# %% filter punctuation:
if punct_mode == PunctuationMode.remove:
t_text = [token for token in t_text if token not in punctuations_to_remove]
elif punct_mode == PunctuationMode.replace:
t_text = [token if token not in punctuations_to_remove else punct_replace_symbol for token in t_text]
# %% number filtering
filtered_text = []
use_single_symbol = True
for i, word in enumerate(t_text):
# always remove dates and histonums:
if is_histo_num(word) or is_date(word):
continue
if number_mode != NumberMode.keep:
if word.isdigit(): # remove numbers
if number_mode == NumberMode.replace:
filtered_text.append('*' if use_single_symbol else '_zahl_')
continue
elif number_mode == NumberMode.remove:
continue
elif ',' in word: # remove "0,3"
w = word.split(',')
if len(w) == 2:
if w[0].isdigit() and w[1].isdigit():
if number_mode == NumberMode.replace:
filtered_text.append('*' if use_single_symbol else 'x,y')
continue
elif number_mode == NumberMode.remove:
continue
elif word[0].isdigit() and word[-1] == 'nm': # remove distances like 500nm
if number_mode == NumberMode.replace:
filtered_text.append('*' if use_single_symbol else 'x_nm')
continue
elif number_mode == NumberMode.remove:
continue
elif '/' in word: # remove stuff like 6/10
w = word.split('/')
if len(w) == 2:
if w[0].isdigit() and w[1].isdigit():
if number_mode == NumberMode.replace:
filtered_text.append('*' if use_single_symbol else 'x/y')
continue
elif number_mode == NumberMode.remove:
continue
elif '-' in word: # remove stuff like 5-10
w = word.split('-')
if len(w) == 2:
if w[0].isdigit() and w[1].isdigit():
if number_mode == NumberMode.replace:
filtered_text.append('*' if use_single_symbol else 'x-y')
continue
elif number_mode == NumberMode.remove:
continue
elif word[0].isdigit() and word[-1].lower() == 'x': # remove 6x ('six times ...')
if number_mode == NumberMode.replace:
filtered_text.append('*' if use_single_symbol else 'x_mal')
continue
elif number_mode == NumberMode.remove:
continue
elif word[0].isdigit() and word[-1].lower() == '%': # remove 5_%
if number_mode == NumberMode.replace:
filtered_text.append('*' if use_single_symbol else 'x_%')
continue
elif number_mode == NumberMode.remove:
continue
elif '+' in word: # remove sum expressions like "3+3+3=9/20"
if number_mode == NumberMode.replace:
filtered_text.append('*' if use_single_symbol else '_summe_')
continue
elif number_mode == NumberMode.remove:
continue
elif word == 'cm' or word == 'mm': # also remove distance-unit words:
if number_mode == NumberMode.replace:
filtered_text.append('*' if use_single_symbol else '_distanz_')
continue
elif number_mode == NumberMode.remove:
continue
filtered_text.append(word)
t_text = filtered_text
del filtered_text
# always lowercase the text at the end, otherwise
# case-sensitive operations might not work anymore!
if not cased:
t_text = [word.lower() for word in t_text]
# join the tokens back into a single string if the output should not be tokenized:
if not do_tokenize:
t_text = ' '.join(t_text)
# %% add to the list
preprocessed_corpus.append(t_text)
if idx == random_example_idx:
print("-------------- Preprocessing Example: ---------------")
print("Original text of " + t_file + ":")
print(original_text)
print("Processed text:")
print(t_text)
print("-----------------------------\n")
return preprocessed_corpus
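# Minimal usage sketch (the parameter values below are illustrative assumptions; in
# practice the dict is loaded from a *_meta.json file, see main() below):
#   example_params = {
#       'source_data': '../DataNephroTexts/diagnosis',
#       'tokenized': True,
#       'cased': False,
#       'stopword_filtered': True,
#       'use_combiner': True,
#       'use_replacer': True,
#       'lemma_mode': LemmatizeMode.lemma_only_nouns.value,
#       'punct_mode': PunctuationMode.remove.value,
#       'number_mode': NumberMode.remove.value,
#   }
#   preprocessed_corpus = preprocess(example_params)  # one entry per .txt file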
def main():
# parse arguments:
sys.path.append(os.getcwd())
parser = argparse.ArgumentParser()
parser.add_argument("--path_to_preprocessing_params",
default='database/bow_prepro_diag_meta.json')
parser.add_argument("--target_path",
default='database/bow_prepro_diag.pkl')
args = parser.parse_args()
with open(args.path_to_preprocessing_params) as json_file:
params = json.load(json_file)
print(f"------ Preprocessing parameters: ------")
print(prepro_params_2_string(params))
print()
preprocessed_corpus = preprocess(params)
with open(args.target_path, 'wb') as f:
pickle.dump(preprocessed_corpus, f)
print(f"saved preprocessed corpus at {args.target_path}")
'''print(get_corpus_stats("../DataNephroTexts/description"))
print(get_corpus_stats("../DataNephroTexts/diagnosis"))
print(get_corpus_stats("../DataNephroTexts/end"))'''
if __name__ == '__main__':
main()
from database_preparation.utils_stringpreparation import read_german_text
import re
def get3parts(t_file):
#%% load the text
t_text = read_german_text(t_file)
#%% define codon-find function
def find_codon(text, word_list):
codon = 'XENOTARSOSAURUS'
for i_word in word_list:
if text.find(i_word) > -1:
codon = i_word
return codon
#%% get start codons for description and diagnosis
# assuming that each pathologist sticks to his/her wording
start_codon_description = find_codon(t_text,
['Lichtmikroskopie:', 'Mikroskopie:',"Histologie:"])
start_codon_2nd = find_codon(t_text,
["Nachbericht", "Immunhistochemie"])
start_codon_conclusion = find_codon(t_text,
["Beurteilung:", "Begutachtung:"])
start_codon_comment = find_codon(t_text,
["Kommentar"])
if t_text.find("Mit freundlichen") > -1:
start_codon_greetings = "Mit freundlichen"
else:
start_codon_greetings = "Prof."
#%% set the stop codons and prepare the function
# like on DNA, the next start codon is a stop codon
stop_codon_list = [start_codon_conclusion, start_codon_description,
start_codon_comment, start_codon_greetings, start_codon_2nd]
from database_preparation.utils_stringpreparation import regexp
def get_codon_idx(text, start_codon, stop_codon_list):
_, idx_start = regexp(start_codon, text)
idx_stop = []
for i_idx_start in idx_start:
idx_stop_list = []
for i_stop_codon in stop_codon_list:
if not i_stop_codon == start_codon:
idx_stop_list.append(text[i_idx_start:].find(i_stop_codon))
idx_stop_list = [item for item in idx_stop_list if item >= 0]
idx_stop.append(min(idx_stop_list) + i_idx_start)
return idx_start, idx_stop
#%% find the indices for the text-frames
start_description, stop_description = get_codon_idx(t_text,
start_codon_description,
stop_codon_list)
start_2nd, stop_2nd = get_codon_idx(t_text,
start_codon_2nd,
stop_codon_list)
start_conclusion, stop_conclusion = get_codon_idx(t_text,
start_codon_conclusion,
stop_codon_list)
#%% get the text parts
def get_text_frame(idx_start_list, idx_stop_list, text):
t_frame = []
for i in range(0, len(idx_start_list)):
t_frame.append(text[idx_start_list[i]:idx_stop_list[i]])
return t_frame
txt_micro = get_text_frame(start_description, stop_description, t_text)
txt_2nd = get_text_frame(start_2nd, stop_2nd, t_text)
txt_conclusion = get_text_frame(start_conclusion, stop_conclusion, t_text)
#%% finalise the text
txt_conclusion = txt_conclusion[-1]
if not txt_2nd == []:
txt_micro = str(txt_micro[0]) + str(txt_2nd[0])
else:
txt_micro = str(txt_micro[0])
# get greetings-section:
start_greetings_index = t_text.find("Mit freundlichen")
txt_greetings = None
if start_greetings_index == -1:
start_greetings_index = t_text.find("Prof.")
if start_greetings_index != -1:
txt_greetings = t_text[start_greetings_index:]
return txt_micro, txt_conclusion, txt_greetings
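# Usage sketch (hypothetical report file; real file names follow the pattern used by the
# splitting script further below, e.g. H.123.0.txt):
#   txt_micro, txt_conclusion, txt_greetings = get3parts('../DataNephroTexts/reports/H.123.0.txt')
#   print(txt_conclusion)  # the "Beurteilung:" / "Begutachtung:" part of the report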
import pandas as pd
import pickle
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from database_preparation.preprocess import print_meta_data, prepro_params_2_string
# parameters:
df_cases_file = "database/df_cases.pkl"
text_corpus_paths = ['database/embedding_prepro_diag.pkl',
'database/bow_prepro_diag.pkl',
'database/embedding_prepro_desc.pkl',
'database/bow_prepro_desc.pkl']
vector_corpus_paths = ['database/diagnosis_texts_vectorized_DR_preprocessed.pkl',
'database/diagnosis_texts_vectorized_bow_preprocessed.pkl',
'database/description_texts_vectorized_DR_preprocessed.pkl',
'database/description_texts_vectorized_bow_preprocessed.pkl']
####### functions ##########
def identity(words):
return words
def get_trained_tfidf(texts):
vec = TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)
return vec.fit_transform(texts)
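# Note: the identity tokenizer/preprocessor is used because the texts are expected to be
# tokenized already (lists of tokens), so TfidfVectorizer only counts and weights them.
# Usage sketch with hypothetical token lists:
#   X = get_trained_tfidf([['Niere', 'Biopsie'], ['Niere', 'Tubulus']])
#   print(X.shape)  # (2, vocabulary_size)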
def save_vectorized_text(text_corpus_path, vector_corpus_path):
with open(text_corpus_path, 'rb') as f:
text_lst = pickle.load(f)
text1 = np.asarray(text_lst[0])
text_lst_is_tokenized = bool(text1.ndim)
if not text_lst_is_tokenized:
tokenized_texts = []
for t_text in text_lst:
tokenized_texts.append(nltk.tokenize.word_tokenize(t_text, language='german'))
text_lst = tokenized_texts
del tokenized_texts
vectorized_text = get_trained_tfidf(text_lst)
with open(vector_corpus_path, 'wb') as f:
pickle.dump(vectorized_text, f)
print(f"saved {vector_corpus_path}")
# save metadata:
'''with open(text_corpus_path.replace('.pkl', '_meta.pkl'), 'rb') as f:
params = pickle.load(f)
metadata_text = prepro_params_2_string(params)
with open(vector_corpus_path.replace('.pkl', '_meta.txt'), 'w') as f:
f.write(metadata_text)'''
def main():
for i, text_corpus_path in enumerate(text_corpus_paths):
save_vectorized_text(text_corpus_path, vector_corpus_paths[i])
if __name__ == '__main__':
main()
'''
This script splits the reports (.txt files) located in the
--path_to_reports folder into description-, diagnosis- and end-sections
and saves these parts in --target_folder_path (as .txt files).
In addition, a pandas dataframe (--df_cases_file) is generated, which
records which three report-sections belong together. The df_cases
dataframe is also used to label the reports with different labelsets.
Also pass --author_names (space-separated names, cased) to label the reports by the
authors found (stored in df_cases).
'''
import sys
import glob
from tqdm import tqdm
import pandas as pd
from database_preparation.reportPreparationTools import get3parts
import os, shutil
import argparse
from database_preparation.label_reports_with_authors import add_author_labels_to_df_cases
sys.path.append(os.getcwd())
# parse arguments:
sys.path.append(os.getcwd())
parser = argparse.ArgumentParser()
parser.add_argument("--path_to_reports",
default='../DataNephroTexts/reports')
parser.add_argument("--target_folder_path",
default='../DataNephroTexts')
parser.add_argument("--df_cases_file",
default='database/df_cases.pkl')
parser.add_argument("--author_names",
default="Name1 Name2")
parser.add_argument("--text_encoding",
default="latin-1")
parser.add_argument("--use_newest_reports", action='store_true')
args = parser.parse_args()
# %% get all files
# get the primary reports, whose histo numbers end with .0:
report_file_list = glob.glob(args.path_to_reports + '/*0.txt')
# get all last reports (newest ones)
if args.use_newest_reports:
print("using newest reports (if there is a H.123.0 and a H.123.2, take H.123.2)")
for idx in tqdm(range(0, len(report_file_list))):
t_file_name = report_file_list[idx]
report_file_list[idx] = glob.glob(t_file_name[0:-5] + "*.txt")[-1]
else:
print("using oldes reports (if there is a H.123.0 and a H.123.2, take H.123.0)")
def make_folder(dir):
shutil.rmtree(dir, ignore_errors=True)
os.makedirs(dir)
make_folder(args.target_folder_path + "/description")
make_folder(args.target_folder_path + "/diagnosis")
make_folder(args.target_folder_path + "/end")
make_folder(args.target_folder_path + "/short_diagnosis")
print(f"Splitting reports of corpus {args.path_to_reports} into description-, diagnosis- and end-sections...")
# %% iterate over all files
error_file_list = []
no_error_file_list = []
lst_description, lst_diagnose, lst_end = [], [], []
for idx, t_file in tqdm(enumerate(report_file_list)):
try:
txt_micro, txt_conclusion, end = get3parts(t_file)
no_error_file_list.append(t_file)
except:
#print(f"skipped file {t_file}. Could not split the text into description and diagnosis-part")
error_file_list.append(t_file)
continue
with open(args.target_folder_path + "/description/description#" + str(idx) + ".txt", "w",
encoding=args.text_encoding) as text_file:
text_file.write(txt_micro)
with open(args.target_folder_path + "/diagnosis/diagnosis#" + str(idx) + ".txt", "w",
encoding=args.text_encoding) as text_file:
text_file.write(txt_conclusion)
with open(args.target_folder_path + "/end/end#" + str(idx) + ".txt", "w",
encoding=args.text_encoding) as text_file:
if end is None:
end = "None"
text_file.write(end)
lst_description.append('description#' + str(idx) + ".txt")
lst_diagnose.append('diagnosis#' + str(idx) + ".txt")
lst_end.append('end#' + str(idx) + ".txt")
#save skipped reports:
with open(args.target_folder_path + "/failed_to_split_list.txt", "w") as text_file:
text_file.write('\n'.join(error_file_list))
# print infos:
print(" ===> finished < === ")
print(f"skipped {len(error_file_list)} reports "
f"({round(len(error_file_list)/len(report_file_list)*100,1)}%), since text splitting "
f"failed (see {args.target_folder_path + '/failed_to_split_list.txt'}).")
processed_docs = len(lst_description)
print(f"saved {processed_docs} description sections at {args.target_folder_path + '/description'}")
print(f"saved {processed_docs} diagnosis sections at {args.target_folder_path + '/diagnosis'}")
print(f"saved {processed_docs} end sections at {args.target_folder_path + '/end'}")
# create and save df_cases file, which is also used to save the labels (=cluster-indices) for each text.
df = pd.DataFrame(list(zip(lst_description, lst_diagnose, lst_end)),
columns=['description_text_files', 'diagnosis_text_files', 'end_text_files'])
df.to_pickle(args.df_cases_file)
# search for authors in end-sections in order to add them as labels to the df_cases file:
try:
add_author_labels_to_df_cases(args.target_folder_path + '/end', args.author_names.split(' '), args.df_cases_file)
except:
print("label the reports with authors failed.")
df = pd.read_pickle(args.df_cases_file)
print(f"saved df_cases at {args.df_cases_file}\n")
print()
import nltk
from database_preparation.utils_wordbase import RegexpReplacer
lst_stopwords_patho = ['Ca.', 'Ca', 'ca.', 'ca', 'Circa', 'circa']
# there are a few words in the german nltk.corpus.stopwords list which we want to keep!
lst_stopwords_to_keep = ['keiner', 'keinen', 'keines', 'keinem', 'keine', 'kein']
def filter_stopwords(tokenized_txt, additional_stop_words=None):
'''
- the passed text list has to be tokenized!
'''
stop_words = nltk.corpus.stopwords.words('german')
stop_words.extend(lst_stopwords_patho)
if type(additional_stop_words) == list:
stop_words.extend(additional_stop_words)
replacer = RegexpReplacer()
for word in lst_stopwords_to_keep:
stop_words.remove(word)
new_stopwords = []
for word in stop_words: # add replaced words to stoplist:
new_word = replacer.replace(word)
if word != new_word:
new_stopwords.append(new_word)
stop_words.extend(new_stopwords)
return [w for w in tokenized_txt if w.lower() not in stop_words]
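# Usage sketch (hypothetical token list; negations like 'kein' are kept on purpose):
#   filter_stopwords(['kein', 'Anhalt', 'für', 'eine', 'Abstoßung'])
#   -> ['kein', 'Anhalt', 'Abstoßung'] (the exact result depends on the nltk stopword list)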
\ No newline at end of file
import math
import pandas as pd
import os
import sys
sys.path.append(os.getcwd())
import datasets
import pyarrow as pa
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
import pickle
import scipy
import random
from tqdm import tqdm
path2textfiles = "../DataNephroTexts/input/"
path2diagnosefiles = "../DataNephroTexts/label/"
def is_text_lst_tokenized(path2corpus):
try:
text_lst = pd.read_pickle(path2corpus)
text1 = np.asarray(text_lst[0])
return bool(text1.ndim)
except:
return False
def is_text_lst_tfidf_vectorized(path2corpus):
try:
with open(path2corpus, 'rb') as f:
loaded_texts = pickle.load(f)
return type(loaded_texts) == scipy.sparse.csr.csr_matrix
except:
return False
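# Usage sketch (pickle paths as used elsewhere in this repo; both helpers return False
# if the file cannot be read):
#   is_text_lst_tokenized('database/bow_prepro_diag.pkl')
#   -> True if the pickle holds token lists instead of plain strings
#   is_text_lst_tfidf_vectorized('database/diagnosis_texts_vectorized_bow_preprocessed.pkl')
#   -> True if the pickle holds a scipy sparse CSR matrix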
def text_label_2_labeled_dataset(texts, unfiltered_labels, print_infos=False):
'''
- sorts out outlier documents (which belong to cluster -1 or cluster None)
- converts the passed text-label pairs to the datasets.Dataset type.
- returns dataset in format: {"text": labeled_texts, "label": labels}
'''
# collect all text-label pairs, skipping invalid labels
labeled_texts = []
labels = []
skipped_labels = 0
# throw out invalid labels:
for i, l in enumerate(unfiltered_labels):
try:
label = int(l)
if label < 0:
skipped_labels += 1
continue
except:
skipped_labels += 1
continue
labels.append(label)
labeled_texts.append(texts[i])
if print_infos:
print("skipped " + str(skipped_labels))
labels = label_list_as_int_list(labels)
# convert it to a hf_dataset, that we can use our tools:
df = pd.DataFrame({"text": labeled_texts, "label": labels})
return datasets.Dataset(pa.Table.from_pandas(df))
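# Usage sketch (hypothetical toy data): invalid labels (None, -1) are dropped together
# with their texts before the datasets.Dataset is built:
#   ds = text_label_2_labeled_dataset(['text a', 'text b', 'text c'], [0, -1, None])
#   ds['text']   -> ['text a']
#   ds['label']  -> [0]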
def text_label_files_to_labeled_dataset(label_set,
path2corpus="./database/bow_prepro_desc.pkl",
df_cases_path="./database/df_cases.pkl", print_infos=False):
'''
- sorts out outlier documents (which belong to cluster -1 or cluster None)
- converts the pandas dataframe to the datasets.Dataset type.
'''
df_cases = pd.read_pickle(df_cases_path)
texts = pd.read_pickle(path2corpus)
unfiltered_labels = df_cases["label_" + label_set]
return text_label_2_labeled_dataset(texts, unfiltered_labels, print_infos)
def get_all_label_set_ids():
df = pd.read_pickle("./database/df_cases.pkl")
return [e[6:] for e in df.columns if "label_" in e]
def get_filename_label_tuple(label_set, get_micro_txt=True, df_cases_file="./database/df_cases.pkl"):
'''
returns (textfilename_list, label_list) as ([filenames], [labels as int list]).
It will still contain outlier labels (they have the value None or -1).
'''
df_cases = pd.read_pickle(df_cases_file)
if "label_" + label_set not in df_cases.columns:
raise ValueError("label set " + label_set + " does not exist in df_cases!")
return None
# convert labels to integers:
int_labels = label_list_as_int_list(df_cases["label_" + label_set])
if get_micro_txt:
return df_cases["description_text_files"], int_labels
else:
return df_cases["diagnosis_text_files"], int_labels
def get_amount_unique_labels(label_set, df_cases_file="./database/df_cases.pkl"):
'''
returns the amount of unique labels (does not count nan or -1 classes).
If label_set does not exist, you will get
an error. If so, run generate_save_hf_dataset(...) to generate a labeled dataset
of type datasets.Dataset (datasets is a library from huggingface)
'''
df_cases = pd.read_pickle(df_cases_file)
if "label_" + label_set not in df_cases.columns:
raise ValueError("label set " + label_set + " does not exist in df_cases!")
return None
# convert labels to integers:
labels = label_list_as_int_list(df_cases["label_" + label_set])
has_none_labels = False
for label in labels:
if label == -1 or np.isnan(label) or label is None:
has_none_labels = True
return len(list(set(labels))) - 1
return len(list(set(labels)))
def get_amount_reports(label_set):
'''
returns amount of reports which have a valid label (excluding -1 and NaN values)
'''
# train_test_dataset = load_labeled_dataset(label_set)
# return len(train_test_dataset["label"])
text, labels = get_filename_label_tuple(label_set)
return len([l for l in labels if l >= 0])
def generate_save_hf_dataset(label_set="LDA", overwrite=True, lower=False):
'''
Generate a labeled dataset of type datasets.Dataset
(datasets is a library from huggingface)
and saves it under "./database/labeled_dataframes/labeld_dataset_" + label_set
'''
dataset_path = "./database/labeled_dataframes/labeld_dataset_" + label_set
if os.path.exists(dataset_path):
print(dataset_path + " already exists.")
if overwrite:
print("generating it new and overwrite " + dataset_path)
else:
print("skipping generation of " + dataset_path)
return
df_cases = pd.read_pickle("./database/df_cases.pkl")
# print(df_cases.columns)
# collect all text-label pairs, skipping invalid labels!
diag_text_tokenized = pd.read_pickle("./database/diag_lst_tokenized.pkl")
texts = []
labels = []
diagnoses = []
skipped_labels = 0
# throw out invalid labels:
print("creating " + dataset_path)
for i, l in enumerate(df_cases["label_" + label_set]):
try:
label = int(l)
if label < 0:
skipped_labels += 1
continue
except:
skipped_labels += 1
continue
labels.append(label)
file_id = df_cases["description_text_files"][i]
with open(path2textfiles + file_id, 'r') as f:
if lower:
texts.append(f.read().lower())
else:
texts.append(f.read())
file_id = df_cases["diagnosis_text_files"][i]
with open(path2diagnosefiles + file_id, 'r') as f:
if lower:
diagnoses.append(f.read().lower())
else:
diagnoses.append(f.read())
print("skipped " + str(skipped_labels) + " labels")
# convert to dataframe:
df = pd.DataFrame({
'text': texts,
'label': labels,
'diagnose': diagnoses
})
# convert pandas dataframe to huggingface dataset:
hf_dataset = datasets.Dataset(pa.Table.from_pandas(df))
'''# how to create a DatasetDict:
test_split_length = 100
hf_data_dict = datasets.DatasetDict({"train": datasets.Dataset(pa.Table.from_pandas(df[test_split_length:])),
"test": datasets.Dataset(pa.Table.from_pandas(df[:test_split_length])),
"unsupervised": hf_dataset})
hf_data_dict.save_to_disk(dataset_path)'''
# print("shape of " + dataset_path + ":")
# print(hf_dataset)
hf_dataset.save_to_disk(dataset_path)
def label_list_as_int_list(labels):
'''
converts a label list to a list of integers,
regardless of whether it is a list of floats or strings
'''
int_labels = []
for i, l in enumerate(labels):
try:
int_labels.append(int(labels[i]))
except:
int_labels.append(-1)
return int_labels
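# Example: label_list_as_int_list([2.0, '3', None, 'foo']) -> [2, 3, -1, -1]
# (anything that cannot be cast to int is mapped to the outlier label -1)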
def get_splits_for_cross_val(dataset, fold_amount=10, stratified=True,
merge_classes=None, oversample=False):
'''
dataset should be something which can be accessed via dataset['text'] and dataset['label'].
Returns splits for k-fold cross-validation as datasets.Dataset type.
With something like merge_classes=[(0,1),(2,3,4)] you can merge the indexed classes into one class.
'''
if merge_classes is not None:
new_labels = [i for i in dataset['label']]
for classes_to_merge in merge_classes:
new_class_name = classes_to_merge[0]
for c in classes_to_merge[1:]:
for i, label in enumerate(dataset['label']):
if int(label) == int(c):
new_labels[i] = new_class_name
dataset = text_label_2_labeled_dataset(dataset['text'], new_labels)
if oversample:
dataset = simple_oversampling(dataset)
if stratified:
skf = StratifiedKFold(n_splits=fold_amount, random_state=None, shuffle=False)
for train_index, test_index in skf.split(dataset['text'], dataset['label']):
yield dataset[train_index], dataset[test_index]
else:
folds = KFold(n_splits=fold_amount, shuffle=False)
for train_index, test_index in folds.split(list(range(len(dataset)))):
yield dataset[train_index], dataset[test_index]
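# Usage sketch (assuming 'ds' is a labeled datasets.Dataset as returned by
# text_label_2_labeled_dataset above; 'train' and 'evaluate' are placeholders):
#   for train_fold, test_fold in get_splits_for_cross_val(ds, fold_amount=10):
#       model = train(train_fold['text'], train_fold['label'])
#       evaluate(model, test_fold['text'], test_fold['label'])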
def simple_oversampling(dataset):
print("oversampling (without augmentation!)...")
unique_labels = np.unique(dataset['label'])
label_amount = [0 for x in range(len(unique_labels))]
texts = []
for i, l in enumerate(tqdm(unique_labels)):
i_th_labels = dataset['label'] == l
label_amount[i] = int(np.sum(i_th_labels))
texts.append([dataset['text'][i] for i,label in enumerate(dataset['label']) if label == l])
max_index = label_amount.index(max(label_amount))
for i, l in enumerate(tqdm(unique_labels)):
if i == max_index:
continue
amount_copies = label_amount[max_index] - label_amount[i]
for x in range(amount_copies):
new_element = {'label': l, 'text': random.choice(texts[i])}
dataset = dataset.add_item(new_element)
return dataset
def main():
# args = argsparse_preamble()
# generate_save_hf_dataset(args.clustered_data)
# print label sets
label_sets = get_all_label_set_ids()
print(label_sets)
# dirty fix of OMP: Error #15: Initializing libiomp5md.dll, but found libiomp5md.dll already initialized.
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
# plot histograms: how many docs have the same label (= cluster index)?
for i, label_set in enumerate(label_sets):
text, labels = get_filename_label_tuple(label_set)
labels = np.asarray(label_list_as_int_list(labels))
# plt.subplot(3, 3, i + 1)
plt.close()
label_num = get_amount_unique_labels(label_set)
x = np.arange(label_num)
h, b = np.histogram(labels, bins=label_num)
plt.bar(x, height=h)
plt.xticks(x, x)
plt.title(label_set)
plt.savefig("TextClustering/plots/" + label_set + "_histogram.png")
if __name__ == '__main__':
main()
import codecs
import nltk
from nltk.probability import FreqDist
def read_german_text(filename):
textfile = codecs.open(filename, 'br', "latin-1")
text= textfile.read()
textfile.close()
return text
#%%
def get_most_frequent_words(text, n_words = 10):
from nltk.probability import FreqDist
tokenizer = nltk.RegexpTokenizer(r"\w+")
word_list = tokenizer.tokenize(text)
freq_dist = FreqDist(word_list)
top_words = freq_dist.most_common(n_words)
top_words = [word[0] for word in top_words]
return top_words
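# Usage sketch (hypothetical text):
#   get_most_frequent_words("Niere Niere Biopsie Glomerulum Biopsie Niere", n_words=2)
#   -> ['Niere', 'Biopsie']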
#%%
def get_most_discriminant_words(text, text_vectorizer, text_counterizer, n_words = 10):
#%% get the vector
if not text:
top_words = []
return top_words
tf_idf_vector = text_vectorizer.transform(text_counterizer.transform(text))
#%% define the coo-function
def sort_coo(coo_matrix):
tuples = zip(coo_matrix.col, coo_matrix.data)
tuples = set(tuples)
tuples = sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
return tuples
#%% define the sort function
def extract_topn_from_vector(feature_names, sorted_items, n_words):
"""get the feature names and tf-idf score of top n items"""
# use only topn items from vector
sorted_items = sorted_items[:n_words]
score_vals = []
feature_vals = []
# word index and corresponding tf-idf score
for idx, score in sorted_items:
# keep track of feature name and its corresponding score
score_vals.append(round(score, 3))
feature_vals.append(feature_names[idx])
# create a tuples of feature,score
# results = zip(feature_vals,score_vals)
results = {}
for idx in range(len(feature_vals)):
results[feature_vals[idx]] = score_vals[idx]
return results
#%% sort the results
sorted_items = sort_coo(tf_idf_vector.tocoo())
# extract only the top n; n here is 10
feature_names = text_counterizer.get_feature_names()
keywords = extract_topn_from_vector(feature_names, sorted_items, n_words)
top_words = list(keywords.keys())
return top_words
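# Usage sketch: text_counterizer and text_vectorizer are assumed to be a fitted
# CountVectorizer and a TfidfTransformer fitted on its output ('corpus' and 'some_text'
# are placeholders):
#   from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
#   cv = CountVectorizer()
#   counts = cv.fit_transform(corpus)            # corpus: list of raw text strings
#   tfidf = TfidfTransformer().fit(counts)
#   top = get_most_discriminant_words([some_text], tfidf, cv, n_words=10)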
#%%
def regexp(pattern, text):
import re
index_start, index_stop = [], []
value = []
for match in re.finditer(pattern, text):
index_start.append(match.start())
index_stop.append(match.end())
value.append(match.group())
return index_start, index_stop
\ No newline at end of file
#%% replace words
import re
replacement_patterns = [
('ittelgradig', 'äßiggradig'),
('ittelschwer', 'äßiggradig'),
('Tubulusepithelschädigung', 'Tubulusepithelschaden'),
('prärenalen', 'prärenale'),
('reversibler', 'reversibel'),
('max.', 'maximal'),
('min.', 'minimal')
]
''' ('ae', 'ä'),
('oe', 'ö'),
('ue', 'ü'),
('Ae', 'Ä'),
('Oe', 'Ö'),
('Ue', 'Ü'),'''
class RegexpReplacer(object):
def __init__(self, patterns=replacement_patterns):
self.patterns = [(re.compile(regex), repl) for (regex, repl) in
patterns]
def replace(self, text):
s = text
for (pattern, repl) in self.patterns:
(s, count) = re.subn(pattern, repl, s)
return s
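# Usage sketch (hypothetical sentence):
#   replacer = RegexpReplacer()
#   replacer.replace("mittelgradige Tubulusepithelschädigung, max. 20%")
#   -> "mäßiggradige Tubulusepithelschaden, maximal 20%"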
#%% combine words
from nltk.tokenize import MWETokenizer
combinedword_patterns = [
('tubulointerstitieller', 'Schaden'),
('prärenalen', 'Genese'),
('prärenale', 'Genese'),
('potentiell', 'reversibler'),
('potentiell', 'reversibel'),
('akuter', 'Tubulusepithelschaden'),
('Lupus', 'Nephritis'),
('diabetische', 'Nephropathie'),
('diabetische', 'Glomerulosklerose'),
('0', '%'),
('1', '%'),
('2', '%'),
('3', '%'),
('4', '%'),
('5', '%'),
('6', '%'),
('7', '%'),
('8', '%'),
('9', '%'),
('tubulointerstitielle', 'Schaedigung'),
('segmentale', 'Glomerulosklerose'),
('globale', 'Glomerulosklerose'),
('potentiell', 'reversible'),
('reversible', 'Tubulusepithelschaden'),
('fokal', 'globale'),
('fokal', 'segmentale'),
('interstitielle', 'Nephritis'),
('proliferierende', 'Glomerulonephritis'),
('segmental', 'nekrotisierende')
]
class RegexpSynonyms(object):
def __init__(self):
self.patterns = combinedword_patterns
def combine(self, text):
mwe_tokenizer = MWETokenizer(self.patterns)
s = mwe_tokenizer.tokenize(text)
return s
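# Usage sketch (input must already be tokenized; word pairs from the pattern list above
# are merged into single tokens joined by '_'):
#   RegexpSynonyms().combine(['akuter', 'Tubulusepithelschaden', 'bei', 'Lupus', 'Nephritis'])
#   -> ['akuter_Tubulusepithelschaden', 'bei', 'Lupus_Nephritis']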
from sys import platform
from os.path import expanduser
from googletrans import Translator # use pip install googletrans==3.1.0a0, 3.0 version is broken
import os
# some words which are usually gets translated wrong (from ger to eng):
custom_translation = {'klasse': 'class', 'nih': 'nih', 'leicht': 'minor', 'miterfasst': 'registered',
'gesamtzahl': 'total amount', 'hinweis': 'hint', 'unauffällig': 'unremarkable',
'weitgehend': 'mainly', 'leichtgradiger': 'mild', 'mäßiggradiger': 'a moderate',
'-nih': '-nih', 'bekannt': 'known', 'anschließend': 'followed_by',
'vorbehaltlich': 'for_now', 'mittels': 'using', 'teils': 'partly'}
def make_directory(path):
if not os.path.isdir(path):
os.makedirs(path)
def translate_to(text, src='de', to='en'):
translator = Translator()
return translator.translate(text, src=src, dest=to).text
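# Usage sketch (requires an internet connection; googletrans calls the Google Translate web API):
#   translate_to("Kein Anhalt für eine Abstoßung", src='de', to='en')
#   -> e.g. "No evidence of rejection" (the exact wording depends on the translation service)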
def main():
pass
if __name__ == '__main__':
main()
\ No newline at end of file