Commit bc5855c7 authored by max-laptop

updated topic-modeling-analysis.py

parent d3fb7ac6
......@@ -17,6 +17,10 @@ import pandas as pd
'''
# installed: nltk, Hanta, tqdm, numpy
todo: add custom preprocessing for short diagnosis texts:
- replace: [('\n', ' '), ('DMGS', 'DM GS'), ('FGFSGS', 'FG FSGS'), ('-', ' ')]
- remove: ['(schöner Fall)', 'mit', 'bei', 'nach', 'wohl', 'und']
-
'''
########## define enums ##########
......
......@@ -3,21 +3,25 @@ import joblib
import topicwizard
from sklearn.decomposition import NMF
from sklearn.pipeline import make_pipeline
import pandas as pd
'''
installation:
pip install topic-wizard
'''
# params:
# Path to the pickled DataFrame used as corpus. The code below reads its
# 'case_id' and 'preprocessed_text' columns.
# Superseded earlier value: "data/bow_short_diag/bow_short_diag.df.pkl"
path2corpus = "data/bow_diag_clustering/bow_diag.df.pkl"

if __name__ == '__main__':
    # Bag-of-words features followed by NMF with 10 components. No stop-word
    # list is passed (an earlier variant used stop_words="english").
    vectorizer = CountVectorizer(min_df=5, max_df=0.8)
    model = NMF(n_components=10)
    topic_pipeline = make_pipeline(vectorizer, model)

    # NOTE(review): unpickling can execute arbitrary code; only load corpus
    # files from a trusted location.
    corpus_df = pd.read_pickle(path2corpus)

    # Map each case_id to the 'preprocessed_text' of its first row. Dropping
    # duplicate ids up front replaces the per-row DataFrame filter (quadratic)
    # while keeping the same "first occurrence wins" result and key order.
    first_rows = corpus_df.drop_duplicates(subset='case_id')
    corpus_dict = dict(zip(first_rows['case_id'], first_rows['preprocessed_text']))

    # Each 'preprocessed_text' entry is an iterable of string tokens; join
    # them into one whitespace-separated document per row for the vectorizer.
    corpus = [' '.join(report) for report in corpus_df['preprocessed_text'].tolist()]
    topic_pipeline.fit(corpus)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment