updated topic-modeling-analysis.py

bc5855c7 · max-laptop · d3fb7ac6 · bc5855c7 · bc5855c7
Commit bc5855c7 authored Feb 18, 2025 by max-laptop
Show whitespace changes
Inline Side-by-side

Showing with 14 additions and 6 deletions

preprocess.py database_preparation/preprocess.py +4 -0

topic-modeling-analysis.py topic_modeling/topic-modeling-analysis.py +10 -6

No files found.
--- a/database_preparation/preprocess.py
+++ b/database_preparation/preprocess.py
@@ -17,6 +17,10 @@ import pandas as pd
 '''
 # installed: nltk, Hanta, tqdm, numpy

+todo: add custom preprocessing for short diagnose texts:
+- replace: [('\n', ' '), ('DMGS', 'DM GS'), ('FGFSGS', 'FG FSGS'), ('-', ' ')]
+- remove: ['(schner Fall)', 'mit', 'bei', 'nach', 'wohl', 'und']
+- 
 '''

 ########## define enums ##########

--- a/topic_modeling/topic-modeling-analysis.py
+++ b/topic_modeling/topic-modeling-analysis.py
@@ -3,21 +3,25 @@ import joblib
 import topicwizard
 from sklearn.decomposition import NMF
 from sklearn.pipeline import make_pipeline
+import pandas as pd

-'''
-isntallation:
-pip install topic-wizard

-'''
+
+# params:
+path2corpus = "data/bow_short_diag/bow_short_diag.df.pkl"
+path2corpus = "data/bow_diag_clustering/bow_diag.df.pkl"

 if __name__ == '__main__':
-    vectorizer = CountVectorizer(min_df=5, max_df=0.8, stop_words="english")
+    vectorizer = CountVectorizer(min_df=5, max_df=0.8)

    model = NMF(n_components=10)

    topic_pipeline = make_pipeline(vectorizer, model)

-
+    corpus_df = pd.read_pickle(path2corpus)
+    corpus_dict = {case_id: corpus_df.loc[corpus_df['case_id'] == case_id, 'preprocessed_text'].values[0] for case_id in
+                   corpus_df['case_id']}
+    corpus = [' '.join(report) for report in corpus_df['preprocessed_text'].tolist()]

    topic_pipeline.fit(corpus)