Till Fischer committed on
Commit
8aac46d
·
1 Parent(s): 564a8b1

Clean commit ohne Tokens

Browse files
analyze_aspects.py CHANGED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # analyze_aspects.py
3
+
4
+ #python /Users/fischer/Desktop/HanserMVP/scraping/analyze_aspects.py --isbn "9783446264199" --db-path /Users/fischer/Desktop/buch_datenbank.sqlite --languages de
5
+ # python analyze_aspects.py --isbn "9783446264199" --db-path /Pfad/zur/sqlite.db --languages de
6
+
7
+ import sqlite3
8
+ import argparse
9
+ import logging
10
+ from pathlib import Path
11
+ import nltk
12
+ from transformers import pipeline
13
+ from collections import defaultdict
14
+ import matplotlib.pyplot as plt
15
+
16
+
17
def visualize_aspects(aspect_results: dict[str, list[float]], output_dir: Path, filename: str = "sentiment_aspekte.png"):
    """Save a horizontal bar chart of the mean sentiment score per aspect.

    Args:
        aspect_results: Mapping of aspect name to the list of signed
            sentiment scores collected for that aspect.
        output_dir: Directory the image is written to; it is created
            (including parents) when it does not exist yet.
        filename: Name of the image file inside ``output_dir``.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # An aspect without any score has no defined mean; leave it out instead
    # of dividing by zero.
    scored = {aspect: scores for aspect, scores in aspect_results.items() if scores}
    aspects = list(scored.keys())
    avg_scores = [sum(scores) / len(scores) for scores in scored.values()]
    # Scores within +/-0.1 of zero are drawn in gray.
    colors = ['green' if score > 0.1 else 'red' if score < -0.1 else 'gray' for score in avg_scores]

    plt.figure(figsize=(10, 6))
    bars = plt.barh(aspects, avg_scores, color=colors)
    plt.axvline(x=0, color='black', linewidth=0.8)
    plt.xlabel("Durchschnittlicher Sentiment-Score")
    plt.title("Sentiment-Analyse pro Aspekt")

    for bar, score in zip(bars, avg_scores):
        # Put the value just beyond the tip of the bar: to the right of
        # positive bars and to the left of negative ones, so the text never
        # overlaps the bar itself.
        if score >= 0:
            offset, alignment = 0.01, 'left'
        else:
            offset, alignment = -0.01, 'right'
        plt.text(bar.get_width() + offset, bar.get_y() + bar.get_height() / 2,
                 f"{score:.2f}", va='center', ha=alignment)

    # Show the first aspect at the top, then compute the layout for the
    # final state of the axes.
    plt.gca().invert_yaxis()
    plt.tight_layout()

    output_path = output_dir / filename
    plt.savefig(output_path, dpi=300)
    plt.close()

    logger.info(f"Diagramm gespeichert unter: {output_path}")
42
+
43
+
44
# Sentence tokenization relies on NLTK's "punkt" model. Look for it locally
# first so that importing this module does not contact the download server
# on every run; fetch it only when it is missing.
# NOTE(review): recent NLTK releases appear to load the tokenizer from the
# "punkt_tab" resource instead - confirm against the installed version.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
from nltk import sent_tokenize
47
+
48
# Logging setup
def configure_logging():
    """Configure root logging at INFO level and return this module's logger."""
    log_format = '%(asctime)s - %(levelname)s - %(message)s'
    logging.basicConfig(format=log_format, level=logging.INFO)
    module_logger = logging.getLogger(__name__)
    return module_logger


# Shared by every function in this module.
logger = configure_logging()
54
+
55
# Aspect label maps: each key is the aspect that results are reported under,
# each value lists the candidate labels that count towards that aspect.

# German labels.
ASPECT_LABEL_MAP = {
    "Handlung": ["Handlung", "Plot", "Story", "Aufbau"],
    "Charaktere": ["Charaktere", "Figuren", "Protagonisten", "Nebenfiguren", "Beziehungen"],
    "Stil": ["Stil", "Sprachstil", "Sprache", "Erzählweise"],
    "Emotionale Wirkung": ["Lesevergnügen", "Berührend", "Bewegend", "Begeisternd", "Spannend"],
    "Tiefgang": ["Tiefgang", "Nachdenklich", "Philosophisch", "kritisch"],
    "Thema & Kontext": ["Thema", "Motiv", "Zeitgeschehen", "Historischer Kontext", "Gesellschaft"],
    "Originalität": ["Originalität", "Kreativität", "Innovativ", "Idee", "Humor"],
    "Recherche & Authentizität": ["Recherche", "Authentizität", "Realismus", "Fakten"],
}

# English labels.
ASPECT_LABEL_MAP_EN = {
    "Plot": ["Plot", "Story", "Narrative", "Structure"],
    "Characters": ["Characters", "Protagonists", "Antagonists", "Relationships"],
    "Style": ["Style", "Language", "Tone", "Narration"],
    "Emotional Impact": ["Touching", "Funny", "Exciting", "Moving", "Engaging"],
    "Depth": ["Philosophical", "Thought-provoking", "Insightful", "Critical"],
    "Theme & Context": ["Theme", "Motif", "Historical Context", "Social Issues"],
    "Originality": ["Originality", "Creativity", "Innovation", "Idea"],
    "Research & Authenticity": ["Research", "Authenticity", "Realism", "Facts"],
}

# Flat list of every German candidate label, in map order.
ALL_LABELS = [
    candidate
    for candidates in ASPECT_LABEL_MAP.values()
    for candidate in candidates
]
79
+
80
+
81
+ # --- Datenbankzugriff ---
82
+
83
def load_reviews(db_path: Path, isbn: str) -> list:
    """Load the cleaned review texts stored for one book.

    Args:
        db_path: Path to the SQLite database file.
        isbn: ISBN matched against the ``buch_isbn`` column of the
            ``reviews_und_notizen`` table.

    Returns:
        A list of ``(review_id, text, language)`` tuples, where ``language``
        is ``'de'`` for ``cleaned_text`` and ``'en'`` for ``cleaned_text_en``.
        A review that has both texts yields two entries; empty or non-string
        texts are dropped.

    Raises:
        sqlite3.Error: If the query cannot be executed (for example when the
            table does not exist).
    """
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute(
            "SELECT id, cleaned_text, cleaned_text_en FROM reviews_und_notizen WHERE buch_isbn = ?",
            (isbn,),
        ).fetchall()
    finally:
        # Release the database handle even when the query fails.
        conn.close()

    texts_to_analyze = []
    for review_id, text_de, text_en in rows:
        # German first, then English, so the per-review order stays stable.
        for text, lang in ((text_de, 'de'), (text_en, 'en')):
            if text and isinstance(text, str):
                texts_to_analyze.append((review_id, text, lang))
    return texts_to_analyze
99
+
100
+
101
+ # --- Analysefunktion ---
102
+
103
def _signed_sentiment(label: str, score: float) -> float:
    """Turn a sentiment label and its confidence into a signed score.

    Labels starting with "POS" keep the score, labels starting with "NEG"
    negate it, and every other label (for example "neutral") maps to 0.0 so
    that a confident neutral prediction is not counted as negative.
    """
    normalized = label.upper()
    if normalized.startswith('POS'):
        return score
    if normalized.startswith('NEG'):
        return -score
    return 0.0


def analyze_quickwin(db_path: Path, isbn: str, device: int = -1, languages: "list[str] | None" = None) -> dict:
    """Run aspect-based sentiment analysis over the stored reviews of a book.

    Every review is split into sentences. A zero-shot classifier assigns
    each sentence to an aspect; sentences whose best candidate label scores
    above 0.8 are then rated by the sentiment model of the review language.

    Args:
        db_path: Path to the SQLite database holding the reviews.
        isbn: ISBN of the book whose reviews are analysed.
        device: Device index handed to the transformers pipelines
            (``main`` passes 0 for GPU and -1 otherwise).
        languages: Review languages to include (``"de"`` and/or ``"en"``).
            Defaults to both.

    Returns:
        A dict mapping aspect name to the list of signed sentiment scores
        found for it; an empty dict when no review matches.
    """
    # Avoid a mutable default argument; None stands for "both languages".
    if languages is None:
        languages = ["de", "en"]

    reviews = [r for r in load_reviews(db_path, isbn) if r[2] in languages]
    if not reviews:
        logger.warning(f"Keine gesäuberten Reviews für ISBN {isbn} in den gewählten Sprachen gefunden.")
        return {}

    # NOTE(review): facebook/bart-large-mnli is presumably trained on English
    # NLI data; confirm it is adequate for the German labels and template.
    zsl = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device, multi_label=True)

    # Build the per-language setup once, and only for languages that actually
    # occur, so that unused sentiment models are never loaded.
    # Each entry: (aspect map, candidate labels, sentiment pipeline,
    #              hypothesis template, NLTK tokenizer language).
    present_languages = {lang for _, _, lang in reviews}
    setups = {}
    if 'de' in present_languages:
        setups['de'] = (
            ASPECT_LABEL_MAP,
            ALL_LABELS,
            pipeline("sentiment-analysis", model="oliverguhr/german-sentiment-bert", device=device),
            "Dieser Satz handelt von {}.",
            'german',
        )
    if 'en' in present_languages:
        setups['en'] = (
            ASPECT_LABEL_MAP_EN,
            [label for labels in ASPECT_LABEL_MAP_EN.values() for label in labels],
            pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", device=device),
            "This sentence is about {}.",
            'english',
        )

    aspect_results = defaultdict(list)
    total_aspects = 0

    for review_id, text, lang in reviews:
        if not text or lang not in setups:
            continue

        logger.info(f"Review ID {review_id} ({lang}) wird verarbeitet.")
        aspect_map, all_labels, sent_pipeline, hypothesis_template, tokenizer_language = setups[lang]

        for sent in sent_tokenize(text, language=tokenizer_language):
            # Very short fragments are skipped.
            if not sent.strip() or len(sent) < 15:
                continue

            result = zsl(sent, candidate_labels=all_labels, hypothesis_template=hypothesis_template)

            # The pipeline returns labels ordered by descending score, so
            # only the first one can exceed the threshold.
            top_label = result["labels"][0]
            best_score = result["scores"][0]
            if best_score <= 0.8:
                continue

            # Map the matched candidate label back to its aspect; fall back
            # to the label itself when no aspect lists it.
            main_label = next((k for k, v in aspect_map.items() if top_label in v), top_label)
            if not main_label:
                continue

            ml_sentiment = sent_pipeline(sent)[0]
            final_score = _signed_sentiment(ml_sentiment['label'], ml_sentiment['score'])
            final_label = 'POS' if final_score > 0.1 else 'NEG' if final_score < -0.1 else 'NEU'

            print(
                f"Review {review_id} ({lang}) | Satz: {sent}\n"
                f" Aspekt: {main_label} (via '{top_label}', {best_score:.2f}) | "
                f"ML: {ml_sentiment['label']}({ml_sentiment['score']:.2f}) -> Final: {final_label}({final_score:.2f})"
            )

            aspect_results[main_label].append(final_score)
            total_aspects += 1

    logger.info(f"Total aspects found: {total_aspects}")
    # Return a plain dict so later lookups cannot create empty entries.
    return dict(aspect_results)
170
+
171
+
172
+ # --- Entry Point ---
173
+
174
def main():
    """Parse the command line, run the analysis and save the chart.

    Reads ``--db-path``, ``--isbn``, ``--gpu``, ``--languages`` and
    ``--output-dir`` from the command line, runs ``analyze_quickwin`` and
    passes any result on to ``visualize_aspects``.
    """
    parser = argparse.ArgumentParser(description="Quick-Win ABSA ohne SentiWS")
    parser.add_argument("--db-path", required=True, help="Pfad zur SQLite-Datenbank")
    parser.add_argument("--isbn", required=True, help="ISBN des Buchs")
    parser.add_argument("--gpu", action="store_true", help="GPU verwenden (device=0)")
    parser.add_argument("--languages", nargs="+", choices=["de", "en"], default=["de", "en"],
                        help="Sprachen der Reviews, z. B. --languages de oder --languages de en")
    # Optional; the default keeps the previously hard-coded directory.
    parser.add_argument("--output-dir", default="output",
                        help="Zielverzeichnis für das Diagramm (Standard: output)")
    args = parser.parse_args()

    device = 0 if args.gpu else -1
    aspect_results = analyze_quickwin(
        Path(args.db_path), args.isbn,
        device=device,
        languages=args.languages
    )

    if aspect_results:
        visualize_aspects(aspect_results, Path(args.output_dir))
    else:
        logger.info("Keine Aspekt-Daten zur Visualisierung verfügbar.")


# Without this guard the script defines main() but never runs it.
if __name__ == "__main__":
    main()
aspect-sentiment-analyzer/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
aspect-sentiment-analyzer/README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Aspect Sentiment Analyzer
3
+ emoji: 🌖
4
+ colorFrom: yellow
5
+ colorTo: pink
6
+ sdk: gradio
7
+ sdk_version: 5.34.2
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference