from transformers import AutoModelForMaskedLM, AutoTokenizer
import pickle

# script parameters:
modelname = "bert-base-german-cased"
path2corpus_embedding_preprocessed_diagnosis = 'data/embedding_prepro_diag.pkl'
path2corpus_embedding_preprocessed_description = 'data/embedding_prepro_desc.pkl'

# Load tokenizer and model; only the tokenizer and its [UNK] id are actually
# needed for the OOV check below.
tokenizer = AutoTokenizer.from_pretrained(modelname)
model = AutoModelForMaskedLM.from_pretrained(modelname)
unknown_id = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

# Load the preprocessed corpora.
with open(path2corpus_embedding_preprocessed_description, 'rb') as f:
    micro_texts = pickle.load(f)
with open(path2corpus_embedding_preprocessed_diagnosis, 'rb') as f:
    diag_texts = pickle.load(f)


def find_oov_cases(texts):
    """Count and report words that the tokenizer maps to the [UNK] token."""
    oov_cases = 0
    for text_num, text in enumerate(texts):
        # Only inspect texts whose encoding contains the unknown-token id.
        if unknown_id in tokenizer.encode(text):
            tokens = text.split(" ")
            for i, token in enumerate(tokens):
                if unknown_id in tokenizer.encode(token):
                    oov_cases += 1
                    # Show the OOV word with surrounding context; max() keeps a
                    # negative slice start from wrapping to the end of the list.
                    context = tokens[max(i - 2, 0):i + 2]
                    print(f"found OOV case in text {text_num}")
                    print(f"the word '{token}' in {context} is OOV")
    return oov_cases


oov_sum = find_oov_cases(micro_texts) + find_oov_cases(diag_texts)
print(f"\nFinished. Found {oov_sum} OOV cases (see above).")
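# ---------------------------------------------------------------------------
# Note (illustrative, not part of the original pipeline): find_oov_cases()
# expects each pickle file to hold a plain Python list of preprocessed,
# whitespace-separated report strings, since it iterates over the loaded
# object and splits each entry on spaces. A corpus in that format could be
# written as sketched below; the file name and example texts are hypothetical.
#
#     import pickle
#     example_corpus = [
#         "chronische Entzündung ohne Malignität",
#         "kleines Gewebefragment mit Nekrose",
#     ]
#     with open("data/example_corpus.pkl", "wb") as f:
#         pickle.dump(example_corpus, f)
# ---------------------------------------------------------------------------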