Upload app.py
app.py
ADDED
@@ -0,0 +1,672 @@
# CollegeGenius HF - 100% Hugging Face, No External APIs
# ------------------------------------------------------
# Everything runs inside the Space using open models from the Hugging Face Hub.
# No third-party web APIs or keys required.
#
# Models:
#   - sentence-transformers/paraphrase-MiniLM-L6-v2 (embeddings)
#   - textattack/roberta-base-CoLA (grammar acceptability per sentence)
#   - sshleifer/distilbart-cnn-12-6 (summarization / light rewriting)
#   - j-hartmann/emotion-english-distilroberta-base (emotion for interviews)
#   - distilroberta-base (masked LM for resume verb suggestions)
#
# Launch config is at bottom: demo.queue(...).launch()
# ------------------------------------------------------
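# Dependencies (a sketch inferred from the imports below; pin exact versions
# in the Space's requirements.txt as appropriate):
#   gradio, transformers, torch, sentence-transformers, numpy, pandas,
#   matplotlib, fpdf, pypdf, pillow, easyocr (optional)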
import os
import re
import io
import json
import uuid
import random
import datetime as dt
from typing import List, Dict, Any

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image  # charts are returned as PIL images for gr.Image

import gradio as gr
from fpdf import FPDF

# Optional OCR (local only)
try:
    import easyocr
    _OCR = easyocr.Reader(['en'], gpu=False)
except Exception:
    _OCR = None

# Hugging Face models
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForSeq2SeqLM,
    AutoModelForMaskedLM,
    pipeline,
    set_seed,
)
from sentence_transformers import SentenceTransformer

APP_DIR = "cghf_data"
os.makedirs(APP_DIR, exist_ok=True)

LOG_CSV = os.path.join(APP_DIR, "runs.csv")
if not os.path.exists(LOG_CSV):
    pd.DataFrame(columns=["timestamp", "tool", "score", "meta"]).to_csv(LOG_CSV, index=False)

set_seed(42)

# -----------------------------
# Load models
# -----------------------------
_EMBEDDER = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L6-v2")

# Grammar acceptability (CoLA)
_CoLA_tok = AutoTokenizer.from_pretrained("textattack/roberta-base-CoLA")
_CoLA = AutoModelForSequenceClassification.from_pretrained("textattack/roberta-base-CoLA")
_acc_pipe = pipeline("text-classification", model=_CoLA, tokenizer=_CoLA_tok, return_all_scores=True)

# Summarizer / lightweight rewriting
_SUM_tok = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
_SUM = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")
_sum_pipe = pipeline("summarization", model=_SUM, tokenizer=_SUM_tok)

# Interview emotion
_EMO_tok = AutoTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
_EMO = AutoModelForSequenceClassification.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
_emo_pipe = pipeline("text-classification", model=_EMO, tokenizer=_EMO_tok, top_k=None)

# Masked LM for resume suggestions
_MLM_tok = AutoTokenizer.from_pretrained("distilroberta-base")
_MLM = AutoModelForMaskedLM.from_pretrained("distilroberta-base")
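# All five models load eagerly at import time. The transformers pipelines run
# on CPU because no `device` argument is passed (use device=0 for a GPU);
# SentenceTransformer picks a GPU automatically when one is available.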

# -----------------------------
# Small built-in college dataset
# -----------------------------
COLLEGES = pd.DataFrame([
    ("MIT", "MA", 3.9, 1550, 35, 0.04, 5, 4),
    ("Stanford", "CA", 3.95, 1540, 35, 0.04, 5, 5),
    ("Harvard", "MA", 3.95, 1540, 35, 0.04, 5, 5),
    ("UC Berkeley", "CA", 3.85, 1470, 33, 0.15, 5, 4),
    ("Michigan", "MI", 3.85, 1470, 33, 0.20, 5, 4),
    ("Georgia Tech", "GA", 3.85, 1480, 34, 0.17, 5, 3),
    ("UT Austin", "TX", 3.8, 1420, 32, 0.31, 4, 4),
    ("CMU", "PA", 3.9, 1530, 35, 0.14, 5, 4),
    ("Princeton", "NJ", 3.95, 1540, 35, 0.05, 5, 5),
    ("Yale", "CT", 3.95, 1540, 35, 0.05, 5, 5),
    ("Brown", "RI", 3.9, 1510, 34, 0.06, 4, 5),
    ("Duke", "NC", 3.9, 1520, 34, 0.07, 5, 5),
    ("Northwestern", "IL", 3.9, 1500, 34, 0.07, 4, 5),
    ("NYU", "NY", 3.7, 1450, 33, 0.13, 4, 5),
    ("UF", "FL", 3.8, 1400, 31, 0.31, 4, 3),
    ("Purdue", "IN", 3.7, 1380, 30, 0.53, 4, 3),
    ("Virginia Tech", "VA", 3.7, 1360, 30, 0.57, 4, 3),
    ("UW", "WA", 3.75, 1410, 32, 0.48, 4, 4),
    ("UIUC", "IL", 3.75, 1420, 32, 0.45, 5, 3),
    ("UCLA", "CA", 3.9, 1500, 34, 0.09, 4, 5),
], columns=["name", "state", "avg_gpa", "sat_mid", "act_mid", "accept_rate", "stem", "hum"])
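# Columns: avg_gpa / sat_mid / act_mid are rough mid-range admit stats,
# accept_rate is a fraction, and stem / hum read as 1-5 program-strength
# ratings. The numbers are illustrative demo data, not official figures.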

# -----------------------------
# Utils
# -----------------------------
def _now():
    return dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

def _clean(s: str) -> str:
    if not isinstance(s, str):
        return ""
    return re.sub(r"\s+", " ", s).strip()

def _sentences(text: str) -> List[str]:
    text = _clean(text)
    if not text:
        return []
    sents = re.split(r"(?<=[.!?])\s+", text)
    return [s for s in sents if s]

def _save_log(tool, score, meta):
    df = pd.read_csv(LOG_CSV)
    row = {"timestamp": _now(), "tool": tool, "score": score if score is not None else "", "meta": json.dumps(meta)[:1200]}
    df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
    df.to_csv(LOG_CSV, index=False)
def ocr_local(file) -> str:
    if not file:
        return ""
    path = file.name if hasattr(file, "name") else str(file)
    if path.lower().endswith(".pdf"):
        try:
            from pypdf import PdfReader
            pages = []
            reader = PdfReader(path)
            for p in reader.pages:
                pages.append(p.extract_text() or "")
            t = "\n".join(pages)
            if t.strip():
                return _clean(t)
        except Exception:
            pass
        return "PDF text not found. Export as text PDF or upload an image. (No web OCR used.)"
    else:
        if _OCR:
            res = _OCR.readtext(path, detail=0, paragraph=True)
            return _clean(" ".join(res))
        return "OCR not available locally. Install `easyocr` or paste text."

# -----------------------------
# Essay AI (HF-only)
# -----------------------------
CLICHES = [
    "ever since i was a child", "since the dawn of time", "follow my dreams",
    "changed my life forever", "made me who i am today", "i have always loved",
    "the true meaning", "anything is possible", "importance of hard work",
    "the power of perseverance"
]
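# For textattack/roberta-base-CoLA the generic label names apply, with LABEL_1
# taken as the "linguistically acceptable" class (following the CoLA dataset
# convention that 1 = acceptable), so its probability is the grammar score.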
def acceptability_score(sentence: str) -> float:
    out = _acc_pipe(sentence)[0]
    scores = {o["label"]: o["score"] for o in out}
    return float(scores.get("LABEL_1", 0.0))

def essay_grammar_profile(text: str) -> Dict[str, Any]:
    sents = _sentences(text)
    if not sents:
        return {"avg_acceptability": 0, "per_sentence": []}
    per = [{"sentence": s, "acceptability": round(acceptability_score(s), 3)} for s in sents]
    avg = float(np.mean([p["acceptability"] for p in per]))
    return {"avg_acceptability": round(avg, 3), "per_sentence": per}
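# distilbart-cnn accepts roughly 1024 input tokens, so long essays are
# summarized in ~350-word chunks and the chunk summaries are concatenated.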
def summarize_text(text: str, max_words=120) -> str:
    if not text.strip():
        return ""
    chunks = []
    words = text.split()
    step = 350
    for i in range(0, len(words), step):
        chunk = " ".join(words[i:i+step])
        if len(chunk) < 10:
            continue
        s = _sum_pipe(chunk, max_length=128, min_length=56, do_sample=False)[0]["summary_text"]
        chunks.append(s)
    out = " ".join(chunks) if chunks else _sum_pipe(text, max_length=150, min_length=60, do_sample=False)[0]["summary_text"]
    return " ".join(out.split()[:max_words])
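# encode(..., normalize_embeddings=True) returns unit vectors, so the plain
# dot products below are exactly cosine similarities.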
def redundancy_map(text: str) -> Dict[str, Any]:
    sents = _sentences(text)
    if len(sents) < 2:
        return {"avg_sim": 0.0, "pairs": []}
    embs = _EMBEDDER.encode(sents, normalize_embeddings=True)
    sims = []
    for i in range(len(sents) - 1):
        sim = float(np.dot(embs[i], embs[i+1]))
        sims.append({"pair": (i, i+1), "sim": round(sim, 3)})
    avg = float(np.mean([p["sim"] for p in sims])) if sims else 0.0
    return {"avg_sim": round(avg, 3), "pairs": sims}

def coherence_score(text: str) -> float:
    sents = _sentences(text)
    if len(sents) < 3:
        return 0.5
    embs = _EMBEDDER.encode(sents, normalize_embeddings=True)
    sims = []
    for i in range(len(sents) - 2):
        sims.append(float(np.dot(embs[i], embs[i+2])))
    score = float(np.mean(sims))
    return round((score + 1) / 2, 3)
def cliche_hits(text: str) -> List[str]:
    low = text.lower()
    return [c for c in CLICHES if c in low]

def suggest_rewrite(sentence: str, max_len=64) -> str:
    s = sentence.strip()
    if len(s.split()) < 8:
        return s
    try:
        out = _sum_pipe(s, max_length=min(max_len, 72), min_length=12, do_sample=False)[0]["summary_text"]
        return out
    except Exception:
        return s
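# Composite essay score: 35% grammar acceptability, 25% a redundancy penalty
# (adjacent-sentence similarity above 0.6 is penalized), 25% coherence, and
# 15% a length term that peaks near a ~650-word essay.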
def essay_score(text: str) -> Dict[str, Any]:
    wc = len(re.findall(r"[A-Za-z0-9']+", text))
    grammar = essay_grammar_profile(text)
    redun = redundancy_map(text)
    coher = coherence_score(text)
    cliches = cliche_hits(text)
    s = (
        0.35 * grammar["avg_acceptability"] +
        0.25 * (1 - max(0, redun["avg_sim"] - 0.6)) +
        0.25 * coher +
        0.15 * max(0.0, min(1.0, (800 - abs(650 - wc)) / 800))
    )
    total = int(round(100 * max(0.0, min(1.0, s))))
    return {
        "score": total,
        "word_count": wc,
        "grammar_avg": grammar["avg_acceptability"],
        "redundancy_avg": redun["avg_sim"],
        "coherence": coher,
        "cliches": cliches,
        "per_sentence": grammar["per_sentence"]
    }
def essay_pipeline(prompt, essay_text, upload):
    src = _clean(essay_text or "")
    if (not src) and upload is not None:
        src = ocr_local(upload)
    if not src or src.startswith("OCR not available") or src.startswith("PDF text not found"):
        return src or "No text found.", None, None, None

    profile = essay_score(src)
    summary = summarize_text(src)
    worst = sorted(profile["per_sentence"], key=lambda x: x["acceptability"])[:3]
    rewrites = []
    for w in worst:
        rewrites.append({"original": w["sentence"], "rewrite": suggest_rewrite(w["sentence"])})

    dims = {
        "Grammar": int(100 * profile["grammar_avg"]),
        "Non-redundancy": int(100 * (1 - profile["redundancy_avg"])),
        "Coherence": int(100 * profile["coherence"]),
    }
    img = bars_image(list(dims.keys()), list(dims.values()), title=f"Essay Score: {profile['score']}")

    md = f"### Score: **{profile['score']}/100** \n"
    md += f"- Word count: **{profile['word_count']}**\n"
    md += f"- Grammar acceptability (avg): **{profile['grammar_avg']}**\n"
    md += f"- Adjacent redundancy (avg cosine): **{profile['redundancy_avg']}** (lower is better)\n"
    md += f"- Coherence: **{profile['coherence']}**\n"
    if profile['cliches']:
        md += f"- Clichés to remove: `{', '.join(profile['cliches'])}`\n"
    md += "\n**Auto Summary:**\n> " + summary

    rew_md = "\n\n**Rewrite Suggestions (lowest-acceptability sentences):**\n"
    for r in rewrites:
        rew_md += f"- **Original:** {r['original']}\n \n  **Rewrite:** {r['rewrite']}\n\n"

    _save_log("essay", profile["score"], profile)
    return md + rew_md, img, src[:1200], json.dumps(profile, indent=2)[:1800]

# -----------------------------
# Resume / Activities
# -----------------------------
ACTION_VERBS = [
    "led", "built", "created", "launched", "founded", "organized", "architected",
    "automated", "designed", "initiated", "streamlined", "optimized", "developed",
    "engineered", "authored", "deployed", "scaled", "improved", "won", "achieved",
    "delivered", "analyzed", "spearheaded", "directed", "mentored", "taught",
    "coordinated", "presented", "implemented", "transformed", "secured"
]
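# If a bullet doesn't already start with an action verb, prepend a <mask>
# token and let distilroberta propose one: a top-8 candidate that is itself in
# ACTION_VERBS wins, otherwise the best-scoring candidate word is used.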
def mlm_suggest(bullet: str) -> str:
    low = bullet.lower().strip("-• ")
    has_action = any(low.startswith(v) for v in ACTION_VERBS)
    text = bullet
    if not has_action:
        text = "<mask> " + bullet
    tokens = _MLM_tok(text, return_tensors="pt")
    logits = _MLM(**tokens).logits
    mask_positions = (tokens["input_ids"] == _MLM_tok.mask_token_id).nonzero(as_tuple=True)
    if len(mask_positions[1]) == 0:
        return bullet
    mask_idx = int(mask_positions[1][0])
    probs = logits[0, mask_idx].softmax(-1)
    ids = probs.topk(8).indices.tolist()
    cands = [_MLM_tok.decode([i]).strip() for i in ids]
    for c in cands:
        if re.match(r"^[A-Za-z\-]+$", c) and c.lower() in ACTION_VERBS:
            return c.capitalize() + " " + bullet
    return cands[0].capitalize() + " " + bullet
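# Heuristic bullet rubric: start at 50, add 10 points each for a leading
# action verb, a number, an outcome word, and an 8-30 word length; clamp the
# result to [30, 100].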
def bullet_score(bullet: str) -> int:
    low = bullet.lower()
    action = any(low.startswith(v) for v in ACTION_VERBS)
    nums = bool(re.search(r"\d", low))
    outcome = bool(re.search(r"(result|impact|increase|decrease|grew|reduced|saved|won|revenue|users|raised|donations)", low))
    length_ok = 8 <= len(re.findall(r"[A-Za-z0-9']+", bullet)) <= 30
    score = 50 + 10*action + 10*nums + 10*outcome + 10*length_ok
    return min(100, max(30, score))

def analyze_resume(text: str):
    bullets = [b.strip() for b in text.split("\n") if b.strip()]
    rows = []
    for b in bullets:
        s = bullet_score(b)
        suggestion = mlm_suggest(b) if s < 80 else ""
        rows.append([b, s, suggestion])
    avg = int(np.mean([r[1] for r in rows])) if rows else 0
    _save_log("resume", avg, {"n": len(rows)})
    return pd.DataFrame(rows, columns=["Bullet", "Score", "Suggestion"]), avg
def resume_pipeline(resume_text, upload):
    # Preserve line breaks here: analyze_resume splits bullets on "\n", and
    # _clean() would collapse them into a single line.
    src = (resume_text or "").strip()
    if (not src) and upload is not None:
        src = ocr_local(upload)
        if src.startswith("OCR not available") or src.startswith("PDF text not found"):
            return pd.DataFrame(columns=["Bullet", "Score", "Suggestion"]), None, src
    df, avg = analyze_resume(src)
    img = bars_image(["Avg", "Min", "Max"], [avg, int(df["Score"].min() if len(df) > 0 else 0), int(df["Score"].max() if len(df) > 0 else 0)], title="Resume Bullet Quality")
    md = f"### Resume Analysis\n- Bullets: **{len(df)}**\n- Average quality: **{avg}**/100"
    return df, img, md

# -----------------------------
# College Matcher (local dataset)
# -----------------------------
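# Rough fit heuristic: GPA and test-score gaps relative to each school's
# mid-range, plus a program-strength bonus (stem vs. hum) and a small in-state
# bump; fit >= 0.3 is labeled Target, >= 0.1 Reach, otherwise High Reach.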
def match_colleges(gpa: float, sat: int, act: int, interest: str, state_pref: str):
    df = COLLEGES.copy()
    if sat and sat > 0:
        df["test_gap"] = (sat - df["sat_mid"]) / 200.0
    elif act and act > 0:
        df["test_gap"] = (act - df["act_mid"]) / 4.0
    else:
        df["test_gap"] = 0.0
    df["gpa_gap"] = (gpa - df["avg_gpa"])
    if interest.lower() in ["stem", "engineering", "cs", "math", "physics"]:
        df["fit"] = 0.5*df["gpa_gap"] + 0.5*df["test_gap"] + 0.3*(df["stem"]/5.0)
    else:
        df["fit"] = 0.5*df["gpa_gap"] + 0.5*df["test_gap"] + 0.3*(df["hum"]/5.0)
    if state_pref:
        df.loc[df["state"].str.lower() == state_pref.lower(), "fit"] += 0.15
    tiers = []
    for _, r in df.iterrows():
        if r["fit"] >= 0.3:
            tiers.append("Target")
        elif r["fit"] >= 0.1:
            tiers.append("Reach")
        else:
            tiers.append("High Reach")
    df["tier"] = tiers
    df = df.sort_values("fit", ascending=False)
    cols = ["name", "state", "tier", "fit", "avg_gpa", "sat_mid", "act_mid", "accept_rate"]
    return df[cols].head(12)
def college_pipeline(gpa, sat, act, interest, state_pref):
    try:
        gpa = float(gpa) if gpa else 0.0
        sat = int(sat) if sat else 0
        act = int(act) if act else 0
    except Exception:
        gpa, sat, act = 0.0, 0, 0
    df = match_colleges(gpa, sat, act, interest, state_pref)
    buf = bars_image(df["name"].head(5).tolist(), [round(x, 3) for x in df["fit"].head(5).tolist()], title="Top Fit")
    return df, buf

# -----------------------------
# Interview Practice
# -----------------------------
MAJOR_QUESTIONS = {
    "Computer Science": [
        "Tell me about a time you debugged a difficult problem.",
        "What's a software project you're proud of and why?",
        "Explain an algorithm you like to a non-technical audience."
    ],
    "Business": [
        "Describe a time you influenced someone without authority.",
        "How did you analyze data to drive a decision?",
        "Pitch a product for college students."
    ],
    "Biology": [
        "What experiment taught you the most?",
        "How do you evaluate the quality of a scientific source?",
        "Explain CRISPR at a high level."
    ],
    "Humanities": [
        "How has a book changed how you see the world?",
        "Tell me about a debate that sharpened your thinking.",
        "What does good writing mean to you?"
    ]
}
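# With top_k=None the pipeline scores every label of the emotion model (seven
# emotions, per its model card). Passing the text inside a list keeps the
# output shape [[{...}, ...]] consistent across transformers versions.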
def emo_scores(text: str) -> Dict[str, float]:
    if not text.strip():
        return {}
    outs = _emo_pipe([text])[0]
    return {o["label"]: float(o["score"]) for o in outs}
def interview_pipeline(major: str, behavioral: bool, answer: str):
    if behavioral:
        qs = [
            "Tell me about a time you failed and what you learned.",
            "Describe a conflict you resolved.",
            "When did you change your mind about something important?",
            "What's a project that best represents you and why?"
        ]
    else:
        qs = MAJOR_QUESTIONS.get(major, MAJOR_QUESTIONS["Humanities"])
    q = random.choice(qs)
    sc = emo_scores(answer or "")
    if sc:
        top = sorted(sc.items(), key=lambda x: x[1], reverse=True)[:3]
        md = "### Emotions detected\n" + "\n".join([f"- {k}: {round(v, 3)}" for k, v in top])
    else:
        md = "### Emotions detected\n- (type an answer to analyze)"
    img = bars_image(list(sc.keys()), [round(v, 3) for v in sc.values()], title="Emotion Scores") if sc else None
    return q, md, img

# -----------------------------
# Spike Planner + Charts + PDF
# -----------------------------
SPIKE_IDEAS = {
    "AI/ML": [
        "Build a model for a local nonprofit (forecast demand).",
        "Open-source a dataset or evaluation tool.",
        "Publish an 8-part 'ML intuition for teens' blog."
    ],
    "Finance": [
        "Lead an investing club; run monthly backtests.",
        "Survey students on budgeting; publish findings.",
        "Prototype a student budgeting app."
    ],
    "Biotech": [
        "Bioinformatics analysis of a public dataset.",
        "Write a bioethics mini-series with interviews.",
        "Organize a safe wet-lab collaboration session."
    ]
}

def build_spike(interest: str, weeks: int = 8):
    ideas = SPIKE_IDEAS.get(interest, SPIKE_IDEAS["AI/ML"])
    milestones = []
    for w in range(weeks):
        milestones.append({
            "week": w + 1,
            "goal": f"Progress: {ideas[w % len(ideas)]}",
            "deliverable": f"Week {w+1} demo/post"
        })
    return ideas, milestones
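# Emits a minimal iCalendar file (RFC 5545): one all-day VEVENT per weekly
# milestone, using DTSTART;VALUE=DATE and the CRLF line endings the spec
# requires.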
def export_ics(title: str, milestones):
    now = dt.datetime.now()
    lines = ["BEGIN:VCALENDAR", "VERSION:2.0", "PRODID:-//CGHF//EN"]
    for m in milestones:
        start = now + dt.timedelta(weeks=m["week"] - 1)
        dtstamp = now.strftime("%Y%m%dT%H%M%SZ")
        dtstart = start.strftime("%Y%m%d")
        uid = str(uuid.uuid4()) + "@cghf"
        lines += [
            "BEGIN:VEVENT",
            f"UID:{uid}",
            f"DTSTAMP:{dtstamp}",
            f"DTSTART;VALUE=DATE:{dtstart}",
            f"SUMMARY:{title} - Week {m['week']}: {m['deliverable']}",
            f"DESCRIPTION:{m['goal']}",
            "END:VEVENT"
        ]
    lines.append("END:VCALENDAR")
    out = os.path.join(APP_DIR, "spike.ics")
    with open(out, "w", newline="") as f:
        f.write("\r\n".join(lines))
    return out
def draw_gantt(milestones):
    fig, ax = plt.subplots(figsize=(7, 2 + 0.3*len(milestones)))
    for i, m in enumerate(milestones):
        ax.barh(i, 1, left=m["week"], height=0.4)
        ax.text(m["week"] + 0.05, i, f"W{m['week']} {m['deliverable']}", va="center")
    ax.set_yticks(range(len(milestones)))
    ax.set_yticklabels([f"W{m['week']}" for m in milestones])
    ax.set_xlabel("Week")
    ax.set_title("Spike Timeline")
    buf = io.BytesIO()
    plt.tight_layout()
    plt.savefig(buf, format="png")
    plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
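# fpdf's built-in core fonts (e.g. Arial) only cover Latin-1, so the strings
# written below stick to plain ASCII punctuation.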
def portfolio_pdf(essay_fb, resume_df, colleges_df, spike_title, milestones, emotions_summary):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", "B", 16)
    pdf.cell(0, 10, "CollegeGenius HF - Portfolio", ln=True)

    pdf.set_font("Arial", "", 12)
    pdf.cell(0, 8, "Essay Summary", ln=True)
    for k in ["score", "word_count", "grammar_avg", "redundancy_avg", "coherence"]:
        if k in essay_fb:
            pdf.cell(0, 6, f"- {k}: {essay_fb[k]}", ln=True)

    pdf.ln(3)
    pdf.cell(0, 8, "Resume Highlights (top 5)", ln=True)
    try:
        rows = resume_df.values.tolist()
    except Exception:
        rows = []
    for r in rows[:5]:
        pdf.multi_cell(0, 6, f"* [{r[1]}] {r[0]}")
        if r[2]:
            pdf.multi_cell(0, 6, f"  Suggestion: {r[2]}")

    pdf.ln(3)
    pdf.cell(0, 8, "College Matches", ln=True)
    try:
        for _, row in colleges_df.head(6).iterrows():
            pdf.cell(0, 6, f"- {row['name']} ({row['tier']}) | Fit={round(row['fit'], 3)}", ln=True)
    except Exception:
        pass

    pdf.ln(3)
    pdf.cell(0, 8, f"Spike: {spike_title}", ln=True)
    for m in milestones[:6]:
        pdf.cell(0, 6, f"Week {m['week']}: {m['deliverable']} - {m['goal']}", ln=True)

    pdf.ln(3)
    pdf.cell(0, 8, "Interview Emotions", ln=True)
    for k, v in emotions_summary.items():
        pdf.cell(0, 6, f"- {k}: {round(v, 3)}", ln=True)

    out = os.path.join(APP_DIR, "portfolio.pdf")
    pdf.output(out)
    return out

# -----------------------------
# Viz helper
# -----------------------------
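# The chart is returned as a PIL image: gr.Image accepts a PIL image, numpy
# array, or file path, but not raw PNG bytes.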
def bars_image(labels, values, title=""):
    fig, ax = plt.subplots()
    ax.bar(labels, values)
    ax.set_title(title)
    ax.set_ylim(0, max(1, max(values) if values else 1))
    buf = io.BytesIO()
    plt.tight_layout()
    plt.savefig(buf, format="png")
    plt.close(fig)
    buf.seek(0)
    return Image.open(buf)

# -----------------------------
# Spike pipeline (also returns milestones json)
# -----------------------------
def spike_pipeline(interest, weeks, title_hint):
    ideas, milestones = build_spike(interest, int(weeks))
    ics_path = export_ics(title_hint or f"{interest} Spike", milestones)
    md = "### Spike Plan\n" + "\n".join([f"- W{m['week']}: {m['deliverable']} - {m['goal']}" for m in milestones])
    gantt = draw_gantt(milestones)
    ideas_ht = [(i, None) for i in ideas]  # gr.HighlightedText expects (text, label) pairs
    return md, ideas_ht, ics_path, gantt, json.dumps(milestones)

# -----------------------------
# Build PDF pipeline
# -----------------------------
def essay_to_pdf_pipeline(essay_json, resume_df, college_df, spike_title, milestones_json, emo_json):
    try:
        essay = json.loads(essay_json)
    except Exception:
        essay = {}
    try:
        mil = json.loads(milestones_json)
    except Exception:
        mil = [{"week": 1, "goal": "Start", "deliverable": "Kickoff"}]
    try:
        emo = json.loads(emo_json) if emo_json else {}
    except Exception:
        emo = {}
    out = portfolio_pdf(
        essay,
        resume_df if isinstance(resume_df, pd.DataFrame) else pd.DataFrame(),
        college_df if isinstance(college_df, pd.DataFrame) else COLLEGES,
        spike_title or "Spike",
        mil,
        emo,
    )
    return out

# -----------------------------
# Gradio UI
# -----------------------------
with gr.Blocks(title="CollegeGenius HF", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# CollegeGenius HF - All Local, All Hugging Face\nNo API keys. Everything runs with open models from the Hugging Face Hub.")
    with gr.Tab("Essay Coach"):
        prompt = gr.Textbox(label="Prompt (optional)", lines=2)
        essay_text = gr.Textbox(label="Paste essay text", lines=14)
        upload_essay = gr.File(label="Upload image/PDF (optional; OCR is local-only)")
        btn_essay = gr.Button("Analyze Essay", variant="primary")
        essay_md = gr.Markdown()
        essay_plot = gr.Image(label="Essay Diagnostics")
        essay_excerpt = gr.Textbox(label="First 1200 chars", lines=6)
        essay_json = gr.Textbox(label="Essay JSON (for PDF)", lines=6)
        btn_essay.click(essay_pipeline, inputs=[prompt, essay_text, upload_essay],
                        outputs=[essay_md, essay_plot, essay_excerpt, essay_json])

    with gr.Tab("Resume & Activities"):
        resume_text = gr.Textbox(label="Paste resume bullets (one per line)", lines=10)
        upload_resume = gr.File(label="Upload image/PDF (optional)")
        btn_resume = gr.Button("Analyze Resume", variant="primary")
        resume_df = gr.Dataframe(headers=["Bullet", "Score", "Suggestion"], interactive=False, wrap=True)
        resume_img = gr.Image(label="Quality Snapshot")
        resume_md = gr.Markdown()
        btn_resume.click(resume_pipeline, inputs=[resume_text, upload_resume],
                         outputs=[resume_df, resume_img, resume_md])

    with gr.Tab("College Matcher"):
        with gr.Row():
            gpa = gr.Textbox(label="Unweighted GPA (e.g., 3.8)")
            sat = gr.Textbox(label="SAT (1600)")
            act = gr.Textbox(label="ACT (36)")
        with gr.Row():
            interest = gr.Dropdown(["STEM", "Humanities", "Business", "Biology", "CS", "Engineering"], value="STEM", label="Interest")
            state_pref = gr.Textbox(label="State preference (optional)")
        btn_match = gr.Button("Find Matches", variant="primary")
        college_df = gr.Dataframe(interactive=False, wrap=True)
        college_plot = gr.Image(label="Top Fit")
        btn_match.click(college_pipeline, inputs=[gpa, sat, act, interest, state_pref], outputs=[college_df, college_plot])

    with gr.Tab("Spike Planner"):
        with gr.Row():
            sp_interest = gr.Dropdown(list(SPIKE_IDEAS.keys()), value="AI/ML", label="Interest")
            weeks = gr.Slider(4, 16, value=8, step=1, label="Weeks")
            sp_title = gr.Textbox(label="Spike Title (optional)")
        btn_spike = gr.Button("Generate Plan", variant="primary")
        sp_md = gr.Markdown()
        sp_ideas = gr.HighlightedText(label="Idea Starters", combine_adjacent=True)
        sp_ics = gr.File(label="Calendar (.ics)")
        sp_gantt = gr.Image(label="Gantt")
        sp_json = gr.Textbox(label="Milestones JSON", lines=6)
        btn_spike.click(spike_pipeline, inputs=[sp_interest, weeks, sp_title],
                        outputs=[sp_md, sp_ideas, sp_ics, sp_gantt, sp_json])

    with gr.Tab("Interview Practice"):
        major = gr.Dropdown(list(MAJOR_QUESTIONS.keys()), value="Computer Science", label="Major")
        behavioral = gr.Checkbox(value=True, label="Behavioral?")
        answer = gr.Textbox(label="Your answer (type to analyze emotions)", lines=8)
        btn_interview = gr.Button("Get Question + Analyze", variant="primary")
        iv_q = gr.Textbox(label="Question", lines=2)
        iv_md = gr.Markdown()
        iv_img = gr.Image(label="Emotion Scores")
        btn_interview.click(interview_pipeline, inputs=[major, behavioral, answer], outputs=[iv_q, iv_md, iv_img])

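    # Components created in earlier tabs (essay_json, resume_df, college_df,
    # sp_json) can be reused as inputs here because all tabs live inside the
    # same gr.Blocks context.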
    with gr.Tab("Build Portfolio PDF"):
        gr.Markdown("Combine everything into a shareable PDF.")
        spike_title_in = gr.Textbox(label="Spike Title", value="My Spike")
        emo_json = gr.Textbox(label="Emotions JSON (from Interview tab; optional)", lines=4)
        btn_pdf = gr.Button("Build PDF", variant="primary")
        pdf_out = gr.File()
        btn_pdf.click(essay_to_pdf_pipeline, inputs=[essay_json, resume_df, college_df, spike_title_in, sp_json, emo_json], outputs=[pdf_out])

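# HF Spaces entry point. Note that queue()'s kwargs changed across Gradio
# releases: concurrency_count is the Gradio 3.x name; Gradio 4+ renamed it to
# default_concurrency_limit.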
app = demo

if __name__ == "__main__":
    demo.queue(concurrency_count=2, max_size=32).launch()