ctizzzy0 committed
Commit 1c4706d · verified · 1 Parent(s): f966e89

Upload app.py

Files changed (1)
  1. app.py +672 -0
app.py ADDED
@@ -0,0 +1,672 @@

# CollegeGenius HF — 100% Hugging Face, No External APIs
# ------------------------------------------------------
# Everything runs inside the Space using open models from the Hugging Face Hub.
# No third-party web APIs or keys required.
#
# Models:
# - sentence-transformers/paraphrase-MiniLM-L6-v2 (embeddings)
# - textattack/roberta-base-CoLA (grammar acceptability per sentence)
# - sshleifer/distilbart-cnn-12-6 (summarization / light rewriting)
# - j-hartmann/emotion-english-distilroberta-base (emotion for interviews)
# - distilroberta-base (masked LM for resume verb suggestions)
#
# Launch config is at bottom: demo.queue(...).launch()
# ------------------------------------------------------

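# Dependencies implied by the imports below (not pinned in this commit):
# gradio, transformers, torch, sentence-transformers, numpy, pandas,
# matplotlib, fpdf, pypdf, pillow, and (optionally) easyocr.
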
import os
import re
import io
import json
import uuid
import random
import datetime as dt
from typing import List, Dict, Any

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image  # matplotlib renders are handed to gr.Image as PIL images (see bars_image/draw_gantt)

import gradio as gr
from fpdf import FPDF

# Optional OCR (local only)
try:
    import easyocr
    _OCR = easyocr.Reader(['en'], gpu=False)
except Exception:
    _OCR = None

# Hugging Face models
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForSeq2SeqLM,
    AutoModelForMaskedLM,
    pipeline,
    set_seed,
)
from sentence_transformers import SentenceTransformer

APP_DIR = "cghf_data"
os.makedirs(APP_DIR, exist_ok=True)

LOG_CSV = os.path.join(APP_DIR, "runs.csv")
if not os.path.exists(LOG_CSV):
    pd.DataFrame(columns=["timestamp","tool","score","meta"]).to_csv(LOG_CSV, index=False)

set_seed(42)

# -----------------------------
# Load models
# -----------------------------
_EMBEDDER = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L6-v2")

# Grammar acceptability (CoLA)
_CoLA_tok = AutoTokenizer.from_pretrained("textattack/roberta-base-CoLA")
_CoLA = AutoModelForSequenceClassification.from_pretrained("textattack/roberta-base-CoLA")
_acc_pipe = pipeline("text-classification", model=_CoLA, tokenizer=_CoLA_tok, return_all_scores=True)

# Summarizer / lightweight rewriting
_SUM_tok = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
_SUM = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")
_sum_pipe = pipeline("summarization", model=_SUM, tokenizer=_SUM_tok)

# Interview emotion
_EMO_tok = AutoTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
_EMO = AutoModelForSequenceClassification.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
_emo_pipe = pipeline("text-classification", model=_EMO, tokenizer=_EMO_tok, top_k=None)

# Masked LM for resume suggestions
_MLM_tok = AutoTokenizer.from_pretrained("distilroberta-base")
_MLM = AutoModelForMaskedLM.from_pretrained("distilroberta-base")

# -----------------------------
# Small built-in college dataset
# -----------------------------
COLLEGES = pd.DataFrame([
    ("MIT","MA",3.9,1550,35,0.04,5,4),
    ("Stanford","CA",3.95,1540,35,0.04,5,5),
    ("Harvard","MA",3.95,1540,35,0.04,5,5),
    ("UC Berkeley","CA",3.85,1470,33,0.15,5,4),
    ("Michigan","MI",3.85,1470,33,0.20,5,4),
    ("Georgia Tech","GA",3.85,1480,34,0.17,5,3),
    ("UT Austin","TX",3.8,1420,32,0.31,4,4),
    ("CMU","PA",3.9,1530,35,0.14,5,4),
    ("Princeton","NJ",3.95,1540,35,0.05,5,5),
    ("Yale","CT",3.95,1540,35,0.05,5,5),
    ("Brown","RI",3.9,1510,34,0.06,4,5),
    ("Duke","NC",3.9,1520,34,0.07,5,5),
    ("Northwestern","IL",3.9,1500,34,0.07,4,5),
    ("NYU","NY",3.7,1450,33,0.13,4,5),
    ("UF","FL",3.8,1400,31,0.31,4,3),
    ("Purdue","IN",3.7,1380,30,0.53,4,3),
    ("Virginia Tech","VA",3.7,1360,30,0.57,4,3),
    ("UW","WA",3.75,1410,32,0.48,4,4),
    ("UIUC","IL",3.75,1420,32,0.45,5,3),
    ("UCLA","CA",3.9,1500,34,0.09,4,5)
], columns=["name","state","avg_gpa","sat_mid","act_mid","accept_rate","stem","hum"])

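# Column guide for the demo table above (approximate): avg_gpa is on a 4.0
# scale, sat_mid/act_mid are mid-range scores, accept_rate is a fraction, and
# stem/hum are rough 1-5 program-strength ratings used by match_colleges() below.
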
# -----------------------------
# Utils
# -----------------------------
def _now():
    return dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

def _clean(s: str) -> str:
    if not isinstance(s, str): return ""
    return re.sub(r"\s+"," ",s).strip()

def _sentences(text: str) -> List[str]:
    text = _clean(text)
    if not text: return []
    sents = re.split(r"(?<=[.!?])\s+", text)
    return [s for s in sents if s]

def _save_log(tool, score, meta):
    df = pd.read_csv(LOG_CSV)
    row = {"timestamp": _now(), "tool": tool, "score": score if score is not None else "", "meta": json.dumps(meta)[:1200]}
    df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
    df.to_csv(LOG_CSV, index=False)

def ocr_local(file) -> str:
    if not file: return ""
    path = file.name if hasattr(file, "name") else str(file)
    if path.lower().endswith(".pdf"):
        try:
            from pypdf import PdfReader
            pages = []
            reader = PdfReader(path)
            for p in reader.pages:
                pages.append(p.extract_text() or "")
            t = "\n".join(pages)
            if t.strip(): return _clean(t)
        except Exception:
            pass
        return "PDF text not found. Export as text PDF or upload an image. (No web OCR used.)"
    else:
        if _OCR:
            res = _OCR.readtext(path, detail=0, paragraph=True)
            return _clean(" ".join(res))
        return "OCR not available locally. Install `easyocr` or paste text."

# -----------------------------
# Essay AI (HF-only)
# -----------------------------
CLICHES = [
    "ever since i was a child","since the dawn of time","follow my dreams",
    "changed my life forever","made me who i am today","i have always loved",
    "the true meaning","anything is possible","importance of hard work",
    "the power of perseverance"
]

def acceptability_score(sentence: str) -> float:
    out = _acc_pipe(sentence)[0]
    scores = {o["label"]: o["score"] for o in out}
    return float(scores.get("LABEL_1", 0.0))

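# Output shape (illustrative scores): with return_all_scores=True a single
# sentence comes back as a batch of per-label dicts, e.g.
#   _acc_pipe("She walked to school.")
#   -> [[{"label": "LABEL_0", "score": 0.02}, {"label": "LABEL_1", "score": 0.98}]]
# acceptability_score() keeps LABEL_1, which this CoLA checkpoint uses for
# "linguistically acceptable".
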
def essay_grammar_profile(text: str) -> Dict[str, Any]:
    sents = _sentences(text)
    if not sents: return {"avg_acceptability":0, "per_sentence":[]}
    per = [{"sentence": s, "acceptability": round(acceptability_score(s),3)} for s in sents]
    avg = float(np.mean([p["acceptability"] for p in per]))
    return {"avg_acceptability": round(avg,3), "per_sentence": per}

def summarize_text(text: str, max_words=120) -> str:
    if not text.strip(): return ""
    chunks = []
    words = text.split()
    step = 350
    for i in range(0, len(words), step):
        chunk = " ".join(words[i:i+step])
        if len(chunk) < 10: continue
        s = _sum_pipe(chunk, max_length=128, min_length=56, do_sample=False)[0]["summary_text"]
        chunks.append(s)
    out = " ".join(chunks) if chunks else _sum_pipe(text, max_length=150, min_length=60, do_sample=False)[0]["summary_text"]
    return " ".join(out.split()[:max_words])

def redundancy_map(text: str) -> Dict[str, Any]:
    sents = _sentences(text)
    if len(sents) < 2:
        return {"avg_sim": 0.0, "pairs": []}
    embs = _EMBEDDER.encode(sents, normalize_embeddings=True)
    sims = []
    for i in range(len(sents)-1):
        sim = float(np.dot(embs[i], embs[i+1]))
        sims.append({"pair": (i, i+1), "sim": round(sim,3)})
    avg = float(np.mean([p["sim"] for p in sims])) if sims else 0.0
    return {"avg_sim": round(avg,3), "pairs": sims}

def coherence_score(text: str) -> float:
    sents = _sentences(text)
    if len(sents) < 3: return 0.5
    embs = _EMBEDDER.encode(sents, normalize_embeddings=True)
    sims = []
    for i in range(len(sents)-2):
        sims.append(float(np.dot(embs[i], embs[i+2])))
    score = float(np.mean(sims))
    return round((score+1)/2, 3)

def cliche_hits(text: str) -> List[str]:
    low = text.lower()
    return [c for c in CLICHES if c in low]

def suggest_rewrite(sentence: str, max_len=64) -> str:
    s = sentence.strip()
    if len(s.split()) < 8:
        return s
    try:
        out = _sum_pipe(s, max_length=min(max_len, 72), min_length=12, do_sample=False)[0]["summary_text"]
        return out
    except Exception:
        return s

def essay_score(text: str) -> Dict[str, Any]:
    wc = len(re.findall(r"[A-Za-z0-9']+", text))
    grammar = essay_grammar_profile(text)
    redun = redundancy_map(text)
    coher = coherence_score(text)
    cliches = cliche_hits(text)
    # Weighted blend: grammar, low adjacent redundancy, coherence, and a
    # word-count term that peaks at ~650 words and tapers toward 0.
    s = (
        0.35 * grammar["avg_acceptability"] +
        0.25 * (1 - max(0, redun["avg_sim"] - 0.6)) +
        0.25 * coher +
        0.15 * max(0, min(1, (800 - abs(650 - wc)) / 800))
    )
    total = int(round(100 * max(0.0, min(1.0, s))))
    return {
        "score": total,
        "word_count": wc,
        "grammar_avg": grammar["avg_acceptability"],
        "redundancy_avg": redun["avg_sim"],
        "coherence": coher,
        "cliches": cliches,
        "per_sentence": grammar["per_sentence"]
    }

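# Worked example (illustrative inputs): grammar 0.9, adjacent redundancy 0.5,
# coherence 0.7, 650 words ->
#   0.35*0.9 + 0.25*1.0 + 0.25*0.7 + 0.15*1.0 = 0.89 -> score 89/100.
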
def essay_pipeline(prompt, essay_text, upload):
    src = _clean(essay_text or "")
    if (not src) and upload is not None:
        src = ocr_local(upload)
    if not src or src.startswith("OCR not available") or src.startswith("PDF text not found"):
        return src or "No text found.", None, None, None

    profile = essay_score(src)
    summary = summarize_text(src)
    worst = sorted(profile["per_sentence"], key=lambda x: x["acceptability"])[:3]
    rewrites = []
    for w in worst:
        rewrites.append({"original": w["sentence"], "rewrite": suggest_rewrite(w["sentence"])})

    dims = {
        "Grammar": int(100*profile["grammar_avg"]),
        "Redundancy (↓ better)": int(100*(1-profile["redundancy_avg"])),
        "Coherence": int(100*profile["coherence"]),
    }
    img = bars_image(list(dims.keys()), list(dims.values()), title=f"Essay Score: {profile['score']}")

    md = f"### Score: **{profile['score']}/100** \n"
    md += f"- Word count: **{profile['word_count']}**\n"
    md += f"- Grammar acceptability (avg): **{profile['grammar_avg']}**\n"
    md += f"- Adjacent redundancy (avg cosine): **{profile['redundancy_avg']}** (lower is better)\n"
    md += f"- Coherence: **{profile['coherence']}**\n"
    if profile['cliches']:
        md += f"- Clichés to remove: `{', '.join(profile['cliches'])}`\n"
    md += "\n**Auto Summary:**\n> " + summary

    rew_md = "\n\n**Rewrite Suggestions (lowest-acceptability sentences):**\n"
    for r in rewrites:
        rew_md += f"- **Original:** {r['original']}\n \n **Rewrite:** {r['rewrite']}\n\n"

    _save_log("essay", profile["score"], profile)
    return md + rew_md, img, src[:1200], json.dumps(profile, indent=2)[:1800]

# -----------------------------
# Resume / Activities
# -----------------------------
ACTION_VERBS = [
    "led","built","created","launched","founded","organized","architected",
    "automated","designed","initiated","streamlined","optimized","developed",
    "engineered","authored","deployed","scaled","improved","won","achieved",
    "delivered","analyzed","spearheaded","directed","mentored","taught",
    "coordinated","presented","implemented","transformed","secured"
]

def mlm_suggest(bullet: str) -> str:
    low = bullet.lower().strip("-• ")
    has_action = any(low.startswith(v) for v in ACTION_VERBS)
    text = bullet
    if not has_action:
        text = "<mask> " + bullet
    tokens = _MLM_tok(text, return_tensors="pt")
    logits = _MLM(**tokens).logits
    mask_positions = (tokens["input_ids"] == _MLM_tok.mask_token_id).nonzero(as_tuple=True)
    if len(mask_positions[1]) == 0:
        return bullet
    mask_idx = int(mask_positions[1][0])
    probs = logits[0, mask_idx].softmax(-1)
    ids = probs.topk(8).indices.tolist()
    cands = [_MLM_tok.decode([i]).strip() for i in ids]
    for c in cands:
        if re.match(r"^[A-Za-z\-]+$", c) and c.lower() in ACTION_VERBS:
            return c.capitalize() + " " + bullet
    return cands[0].capitalize() + " " + bullet

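# Intended behavior (the actual verb depends on the model's top-k predictions):
# a bullet like "robotics club website with 200 monthly visitors" has no
# leading action verb, so "<mask> " is prepended and the best masked-LM
# candidate (e.g. "Built") is attached in front of the bullet.
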
def bullet_score(bullet: str) -> int:
    low = bullet.lower()
    action = any(low.startswith(v) for v in ACTION_VERBS)
    nums = bool(re.search(r"\d", low))
    outcome = bool(re.search(r"(result|impact|increase|decrease|grew|reduced|saved|won|revenue|users|raised|donations)", low))
    length_ok = 8 <= len(re.findall(r"[A-Za-z0-9']+", bullet)) <= 30
    score = 50 + 10*action + 10*nums + 10*outcome + 10*length_ok
    return min(100, max(30, score))

def analyze_resume(text: str):
    bullets = [b.strip() for b in text.split("\n") if b.strip()]
    rows = []
    for b in bullets:
        s = bullet_score(b)
        suggestion = mlm_suggest(b) if s < 80 else ""
        rows.append([b, s, suggestion])
    avg = int(np.mean([r[1] for r in rows])) if rows else 0
    _save_log("resume", avg, {"n":len(rows)})
    return pd.DataFrame(rows, columns=["Bullet","Score","Suggestion"]), avg

def resume_pipeline(resume_text, upload):
    # Keep newlines here: _clean() would collapse them and merge every bullet
    # into one line before analyze_resume() splits on "\n".
    src = (resume_text or "").strip()
    if (not src) and upload is not None:
        src = ocr_local(upload)
    df, avg = analyze_resume(src)
    img = bars_image(["Avg","Min","Max"], [avg, int(df["Score"].min() if len(df)>0 else 0), int(df["Score"].max() if len(df)>0 else 0)], title="Resume Bullet Quality")
    md = f"### Resume Analysis\n- Bullets: **{len(df)}**\n- Average quality: **{avg}**/100"
    return df, img, md

# -----------------------------
# College Matcher (local dataset)
# -----------------------------
def match_colleges(gpa: float, sat: int, act: int, interest: str, state_pref: str):
    df = COLLEGES.copy()
    if sat and sat>0:
        df["test_gap"] = (sat - df["sat_mid"])/200.0
    elif act and act>0:
        df["test_gap"] = (act - df["act_mid"])/4.0
    else:
        df["test_gap"] = 0.0
    df["gpa_gap"] = (gpa - df["avg_gpa"])
    if interest.lower() in ["stem","engineering","cs","math","physics"]:
        df["fit"] = 0.5*df["gpa_gap"] + 0.5*df["test_gap"] + 0.3*(df["stem"]/5.0)
    else:
        df["fit"] = 0.5*df["gpa_gap"] + 0.5*df["test_gap"] + 0.3*(df["hum"]/5.0)
    if state_pref:
        df.loc[df["state"].str.lower()==state_pref.lower(), "fit"] += 0.15
    tiers = []
    for _, r in df.iterrows():
        if r["fit"] >= 0.3:
            tiers.append("Target")
        elif r["fit"] >= 0.1:
            tiers.append("Reach")
        else:
            tiers.append("High Reach")
    df["tier"] = tiers
    df = df.sort_values("fit", ascending=False)
    cols = ["name","state","tier","fit","avg_gpa","sat_mid","act_mid","accept_rate"]
    return df[cols].head(12)

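# Worked example using the MIT row above: GPA 3.9, SAT 1500, STEM interest ->
#   gpa_gap = 0.0, test_gap = (1500 - 1550)/200 = -0.25
#   fit = 0.5*0.0 + 0.5*(-0.25) + 0.3*(5/5) = 0.175 -> tier "Reach"
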
def college_pipeline(gpa, sat, act, interest, state_pref):
    try:
        gpa = float(gpa) if gpa else 0.0
        sat = int(sat) if sat else 0
        act = int(act) if act else 0
    except Exception:
        gpa, sat, act = 0.0, 0, 0
    df = match_colleges(gpa, sat, act, interest, state_pref)
    buf = bars_image(df["name"].head(5).tolist(), [round(x,3) for x in df["fit"].head(5).tolist()], title="Top Fit")
    return df, buf

# -----------------------------
# Interview Practice
# -----------------------------
MAJOR_QUESTIONS = {
    "Computer Science": [
        "Tell me about a time you debugged a difficult problem.",
        "What's a software project you're proud of and why?",
        "Explain an algorithm you like to a non-technical audience."
    ],
    "Business": [
        "Describe a time you influenced someone without authority.",
        "How did you analyze data to drive a decision?",
        "Pitch a product for college students."
    ],
    "Biology": [
        "What experiment taught you the most?",
        "How do you evaluate the quality of a scientific source?",
        "Explain CRISPR at a high level."
    ],
    "Humanities": [
        "How has a book changed how you see the world?",
        "Tell me about a debate that sharpened your thinking.",
        "What does good writing mean to you?"
    ]
}

def emo_scores(text: str) -> Dict[str,float]:
    if not text.strip(): return {}
    outs = _emo_pipe(text)[0]
    return {o["label"]: float(o["score"]) for o in outs}

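# Label note: this emotion checkpoint scores seven classes (anger, disgust,
# fear, joy, neutral, sadness, surprise); emo_scores() returns all of them.
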
def interview_pipeline(major: str, behavioral: bool, answer: str):
    if behavioral:
        qs = [
            "Tell me about a time you failed and what you learned.",
            "Describe a conflict you resolved.",
            "When did you change your mind about something important?",
            "What's a project that best represents you and why?"
        ]
    else:
        qs = MAJOR_QUESTIONS.get(major, MAJOR_QUESTIONS["Humanities"])
    q = random.choice(qs)
    sc = emo_scores(answer or "")
    if sc:
        top = sorted(sc.items(), key=lambda x: x[1], reverse=True)[:3]
        md = "### Emotions detected\n" + "\n".join([f"- {k}: {round(v,3)}" for k,v in top])
    else:
        md = "### Emotions detected\n- (type an answer to analyze)"
    img = bars_image(list(sc.keys()), [round(v,3) for v in sc.values()], title="Emotion Scores") if sc else None
    return q, md, img

# -----------------------------
# Spike Planner + Charts + PDF
# -----------------------------
SPIKE_IDEAS = {
    "AI/ML": [
        "Build a model for a local nonprofit (forecast demand).",
        "Open-source a dataset or evaluation tool.",
        "Publish an 8-part 'ML intuition for teens' blog."
    ],
    "Finance": [
        "Lead an investing club; run monthly backtests.",
        "Survey students on budgeting; publish findings.",
        "Prototype a student budgeting app."
    ],
    "Biotech": [
        "Bioinformatics analysis of a public dataset.",
        "Write a bioethics mini-series with interviews.",
        "Organize a safe wet-lab collaboration session."
    ]
}

def build_spike(interest: str, weeks: int=8):
    weeks = int(weeks)  # slider values can arrive as floats; range() needs an int
    ideas = SPIKE_IDEAS.get(interest, SPIKE_IDEAS["AI/ML"])
    milestones = []
    for w in range(weeks):
        milestones.append({
            "week": w+1,
            "goal": f"Progress: {ideas[w%len(ideas)]}",
            "deliverable": f"Week {w+1} demo/post"
        })
    return ideas, milestones

def export_ics(title: str, milestones):
    now = dt.datetime.now()
    lines = ["BEGIN:VCALENDAR","VERSION:2.0","PRODID:-//CGHF//EN"]
    for m in milestones:
        start = now + dt.timedelta(weeks=m["week"]-1)
        dtstamp = now.strftime("%Y%m%dT%H%M%SZ")
        dtstart = start.strftime("%Y%m%d")
        uid = str(uuid.uuid4()) + "@cghf"
        lines += [
            "BEGIN:VEVENT",
            f"UID:{uid}",
            f"DTSTAMP:{dtstamp}",
            f"DTSTART;VALUE=DATE:{dtstart}",
            f"SUMMARY:{title} - Week {m['week']}: {m['deliverable']}",
            f"DESCRIPTION:{m['goal']}",
            "END:VEVENT"
        ]
    lines.append("END:VCALENDAR")
    out = os.path.join(APP_DIR, "spike.ics")
    with open(out, "w") as f:
        f.write("\n".join(lines))
    return out

def draw_gantt(milestones):
    fig, ax = plt.subplots(figsize=(7,2+0.3*len(milestones)))
    for i, m in enumerate(milestones):
        ax.barh(i, 1, left=m["week"], height=0.4)
        ax.text(m["week"]+0.05, i, f"W{m['week']} {m['deliverable']}", va="center")
    ax.set_yticks(range(len(milestones)))
    ax.set_yticklabels([f"W{m['week']}" for m in milestones])
    ax.set_xlabel("Week")
    ax.set_title("Spike Timeline")
    buf = io.BytesIO()
    plt.tight_layout()
    plt.savefig(buf, format="png")
    plt.close(fig)
    buf.seek(0)
    return Image.open(buf).convert("RGB")  # gr.Image expects a filepath/numpy array/PIL image, not raw PNG bytes

def portfolio_pdf(essay_fb, resume_df, colleges_df, spike_title, milestones, emotions_summary):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial","B",16)
    # Core PDF fonts are Latin-1 only, so stick to ASCII dashes/bullets below.
    pdf.cell(0,10,"CollegeGenius HF - Portfolio", ln=True)

    pdf.set_font("Arial","",12)
    pdf.cell(0,8,"Essay Summary", ln=True)
    for k in ["score","word_count","grammar_avg","redundancy_avg","coherence"]:
        if k in essay_fb:
            pdf.cell(0,6,f"- {k}: {essay_fb[k]}", ln=True)

    pdf.ln(3)
    pdf.cell(0,8,"Resume Highlights (top 5)", ln=True)
    try:
        rows = resume_df.values.tolist()
    except Exception:
        rows = []
    for r in rows[:5]:
        pdf.multi_cell(0,6,f"* [{r[1]}] {r[0]}")
        if r[2]:
            pdf.multi_cell(0,6,f" Suggestion: {r[2]}")

    pdf.ln(3)
    pdf.cell(0,8,"College Matches", ln=True)
    try:
        for _, row in colleges_df.head(6).iterrows():
            pdf.cell(0,6,f"- {row['name']} ({row['tier']}) | Fit={round(row['fit'],3)}", ln=True)
    except Exception:
        pass

    pdf.ln(3)
    pdf.cell(0,8,f"Spike: {spike_title}", ln=True)
    for m in milestones[:6]:
        pdf.cell(0,6,f"Week {m['week']}: {m['deliverable']} - {m['goal']}", ln=True)

    pdf.ln(3)
    pdf.cell(0,8,"Interview Emotions", ln=True)
    for k,v in emotions_summary.items():
        pdf.cell(0,6,f"- {k}: {round(v,3)}", ln=True)

    out = os.path.join(APP_DIR, "portfolio.pdf")
    pdf.output(out)
    return out

# -----------------------------
# Viz helper
# -----------------------------
def bars_image(labels, values, title=""):
    fig, ax = plt.subplots()
    ax.bar(labels, values)
    ax.set_title(title)
    ax.set_ylim(0, max(1, max(values) if values else 1))
    buf = io.BytesIO()
    plt.tight_layout()
    plt.savefig(buf, format="png")
    plt.close(fig)
    buf.seek(0)
    return Image.open(buf).convert("RGB")  # gr.Image expects a filepath/numpy array/PIL image, not raw PNG bytes

# -----------------------------
# Spike pipeline (also returns milestones json)
# -----------------------------
def spike_pipeline(interest, weeks, title_hint):
    ideas, milestones = build_spike(interest, weeks)
    ics_path = export_ics(title_hint or f"{interest} Spike", milestones)
    md = "### Spike Plan\n" + "\n".join([f"- W{m['week']}: {m['deliverable']} — {m['goal']}" for m in milestones])
    gantt = draw_gantt(milestones)
    # gr.HighlightedText expects (text, label) pairs rather than bare strings.
    idea_spans = [(idea, f"idea {i+1}") for i, idea in enumerate(ideas)]
    return md, idea_spans, ics_path, gantt, json.dumps(milestones)

# -----------------------------
# Build PDF pipeline
# -----------------------------
def essay_to_pdf_pipeline(essay_json, resume_df, college_df, spike_title, milestones_json, emo_json):
    try:
        essay = json.loads(essay_json)
    except Exception:
        essay = {}
    try:
        mil = json.loads(milestones_json)
    except Exception:
        mil = [{"week":1,"goal":"Start","deliverable":"Kickoff"}]
    try:
        emo = json.loads(emo_json) if emo_json else {}
    except Exception:
        emo = {}
    out = portfolio_pdf(essay, resume_df if isinstance(resume_df,pd.DataFrame) else pd.DataFrame(), college_df if isinstance(college_df,pd.DataFrame) else COLLEGES, spike_title or "Spike", mil, emo)
    return out

# -----------------------------
# Gradio UI
# -----------------------------
with gr.Blocks(title="CollegeGenius HF", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎓 CollegeGenius HF – All Local, All Hugging Face\nNo API keys. Everything runs with open models from the Hugging Face Hub.")
    with gr.Tab("📝 Essay Coach"):
        prompt = gr.Textbox(label="Prompt (optional)", lines=2)
        essay_text = gr.Textbox(label="Paste essay text", lines=14)
        upload_essay = gr.File(label="Upload image/PDF (optional; OCR is local-only)")
        btn_essay = gr.Button("Analyze Essay", variant="primary")
        essay_md = gr.Markdown()
        essay_plot = gr.Image(label="Essay Diagnostics")
        essay_excerpt = gr.Textbox(label="First 1200 chars", lines=6)
        essay_json = gr.Textbox(label="Essay JSON (for PDF)", lines=6)
        btn_essay.click(essay_pipeline, inputs=[prompt, essay_text, upload_essay],
                        outputs=[essay_md, essay_plot, essay_excerpt, essay_json])

    with gr.Tab("📄 Resume & Activities"):
        resume_text = gr.Textbox(label="Paste resume bullets (one per line)", lines=10)
        upload_resume = gr.File(label="Upload image/PDF (optional)")
        btn_resume = gr.Button("Analyze Resume", variant="primary")
        resume_df = gr.Dataframe(headers=["Bullet","Score","Suggestion"], interactive=False, wrap=True)
        resume_img = gr.Image(label="Quality Snapshot")
        resume_md = gr.Markdown()
        btn_resume.click(resume_pipeline, inputs=[resume_text, upload_resume],
                         outputs=[resume_df, resume_img, resume_md])

    with gr.Tab("🏫 College Matcher"):
        with gr.Row():
            gpa = gr.Textbox(label="Unweighted GPA (e.g., 3.8)")
            sat = gr.Textbox(label="SAT (1600)")
            act = gr.Textbox(label="ACT (36)")
        with gr.Row():
            interest = gr.Dropdown(["STEM","Humanities","Business","Biology","CS","Engineering"], value="STEM", label="Interest")
            state_pref = gr.Textbox(label="State preference (optional)")
        btn_match = gr.Button("Find Matches", variant="primary")
        college_df = gr.Dataframe(interactive=False, wrap=True)
        college_plot = gr.Image(label="Top Fit")
        btn_match.click(college_pipeline, inputs=[gpa, sat, act, interest, state_pref], outputs=[college_df, college_plot])

    with gr.Tab("🚀 Spike Planner"):
        with gr.Row():
            sp_interest = gr.Dropdown(list(SPIKE_IDEAS.keys()), value="AI/ML", label="Interest")
            weeks = gr.Slider(4, 16, value=8, step=1, label="Weeks")
        sp_title = gr.Textbox(label="Spike Title (optional)")
        btn_spike = gr.Button("Generate Plan", variant="primary")
        sp_md = gr.Markdown()
        sp_ideas = gr.HighlightedText(label="Idea Starters", combine_adjacent=True)
        sp_ics = gr.File(label="Calendar (.ics)")
        sp_gantt = gr.Image(label="Gantt")
        sp_json = gr.Textbox(label="Milestones JSON", lines=6)
        btn_spike.click(spike_pipeline, inputs=[sp_interest, weeks, sp_title],
                        outputs=[sp_md, sp_ideas, sp_ics, sp_gantt, sp_json])

    with gr.Tab("🎤 Interview Practice"):
        major = gr.Dropdown(list(MAJOR_QUESTIONS.keys()), value="Computer Science", label="Major")
        behavioral = gr.Checkbox(value=True, label="Behavioral?")
        answer = gr.Textbox(label="Your answer (type to analyze emotions)", lines=8)
        btn_interview = gr.Button("Get Question + Analyze", variant="primary")
        iv_q = gr.Textbox(label="Question", lines=2)
        iv_md = gr.Markdown()
        iv_img = gr.Image(label="Emotion Scores")
        btn_interview.click(interview_pipeline, inputs=[major, behavioral, answer], outputs=[iv_q, iv_md, iv_img])

    with gr.Tab("📚 Build Portfolio PDF"):
        gr.Markdown("Combine everything into a shareable PDF.")
        spike_title_in = gr.Textbox(label="Spike Title", value="My Spike")
        emo_json = gr.Textbox(label="Emotions JSON (from Interview tab; optional)", lines=4)
        btn_pdf = gr.Button("Build PDF", variant="primary")
        pdf_out = gr.File()
        btn_pdf.click(essay_to_pdf_pipeline, inputs=[essay_json, resume_df, college_df, spike_title_in, sp_json, emo_json], outputs=[pdf_out])

app = demo

if __name__ == "__main__":
    # Gradio 3.x takes `concurrency_count`; Gradio 4.x renamed it to
    # `default_concurrency_limit`, so fall back if the old name is rejected.
    try:
        demo.queue(concurrency_count=2, max_size=32).launch()
    except TypeError:
        demo.queue(default_concurrency_limit=2, max_size=32).launch()