Spaces:

joshstrupp
/

Self-Help-Book-Recommendation-Engine

Sleeping

App Files Files Community

Josh Strupp committed on May 9

Commit

077a7f8

1 Parent(s): b0e35b7

update app

Browse files

Files changed (1) hide show

app.py +211 -96

app.py CHANGED Viewed

@@ -1,113 +1,228 @@
 import gradio as gr
 import pandas as pd
 import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
-def recommend_books(concern, top_n=5, reviews_per_book=2):
-    # Load and preprocess data
-    df = pd.read_csv('self_help_books.csv')
-    # Create TF-IDF vectors from reviews
-    tfidf = TfidfVectorizer(stop_words='english')
-    review_vectors = tfidf.fit_transform(df['Review'].fillna(''))
-    concern_vector = tfidf.transform([concern])
-    # Calculate similarity scores
-    similarities = cosine_similarity(concern_vector, review_vectors).flatten()
-    # Get top books based on review similarity
-    top_indices = np.argsort(similarities)[-top_n:][::-1]
-    recommended_books = df.iloc[top_indices].copy()
-    # Add helpful and harmful reviews
-    for idx, row in recommended_books.iterrows():
-        book_reviews = df[df['Book'] == row['Book']]
-        # Get helpful reviews
-        helpful_reviews = book_reviews.nlargest(reviews_per_book, 'Helpful_Ratio')['Review'].tolist()
-        recommended_books.at[idx, 'Helpful Reviews'] = helpful_reviews
-        # Get critical reviews
-        harmful_reviews = book_reviews.nsmallest(reviews_per_book, 'Helpful_Ratio')['Review'].tolist()
-        recommended_books.at[idx, 'Harmful Reviews'] = harmful_reviews
-    return recommended_books
-def recommend_authors(concern, top_n=5):
-    df = pd.read_csv('self_help_books.csv')
-    # Calculate author metrics
-    author_stats = df.groupby('author_clean').agg({
-        'Helpful_Ratio': ['mean', 'count']
-    }).reset_index()
-    author_stats.columns = ['author_clean', 'helpful_ratio', 'review_count']
-    # Filter authors with minimum reviews
-    min_reviews = 5
-    author_stats = author_stats[author_stats['review_count'] >= min_reviews]
-    # Get top and bottom authors
-    good_authors = author_stats.nlargest(top_n, 'helpful_ratio')
-    risky_authors = author_stats.nsmallest(top_n, 'helpful_ratio')
-    return good_authors, risky_authors
-def recommend_for_concern(concern, num_books=5, num_reviews=2):
-    """Wrapper function to format recommendations for Gradio"""
-    books_df = recommend_books(concern, top_n=num_books, reviews_per_book=num_reviews)
-    good_authors, risky_authors = recommend_authors(concern, top_n=num_books)
-    # Format book recommendations
-    book_output = "=== RECOMMENDED BOOKS ===\n\n"
-    for _, book in books_df.iterrows():
-        book_output += f"📚 {book['Book']}\n"
-        book_output += f"👤 Author: {book['Author']}\n"
-        book_output += f"⭐ Rating: {book['Star_Rating']}\n"
-        book_output += f"💰 Price: ${book['Price']}\n"
-        book_output += f"📊 Helpful Ratio: {book['Helpful_Ratio']:.2f}\n"
-        if book['Helpful Reviews']:
-            book_output += "\n✅ Helpful Reviews:\n"
-            for review in book['Helpful Reviews']:
-                book_output += f"• {review}\n"
-        if book['Harmful Reviews']:
-            book_output += "\n⚠️ Critical Reviews:\n"
-            for review in book['Harmful Reviews']:
-                book_output += f"• {review}\n"
-        book_output += "\n" + "-"*50 + "\n\n"
-    # Format author recommendations
-    author_output = "=== RECOMMENDED AUTHORS ===\n\n"
-    author_output += "✅ Authors Likely to be Helpful:\n"
-    for _, author in good_authors.iterrows():
-        author_output += f"• {author['author_clean']} (Helpful ratio: {author['helpful_ratio']:.2f})\n"
-    author_output += "\n⚠️ Authors to Approach with Caution:\n"
-    for _, author in risky_authors.iterrows():
-        author_output += f"• {author['author_clean']} (Helpful ratio: {author['helpful_ratio']:.2f})\n"
-    return book_output + "\n\n" + author_output
-# Create the Gradio interface
 iface = gr.Interface(
     fn=recommend_for_concern,
     inputs=[
-        gr.Textbox(label="What concern or fear would you like help with?", placeholder="e.g. I'm a lonely teenager"),
-        gr.Slider(minimum=1, maximum=10, value=5, step=1, label="Number of recommendations"),
-        gr.Slider(minimum=1, maximum=5, value=2, step=1, label="Reviews per book")
     ],
     outputs=gr.Textbox(label="Recommendations", lines=20),
-    title="Self-Help Book Recommender",
-    description="Get personalized book recommendations based on your concerns or fears.",
     examples=[
         ["I'm a lonely teenager", 5, 2],
         ["I'm worried about my career", 5, 2],
-        ["I have anxiety about the future", 5, 2]
-    ]
 )
 iface.launch()

 import gradio as gr
 import pandas as pd
 import numpy as np
+from pathlib import Path
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
+# ---------------------------------------------------------------------------
+# 0. LOAD DATA PRE-GENERATED BY THE OFFLINE PIPELINE
+# ---------------------------------------------------------------------------
+# The books table is required; the reviews table is optional and is replaced
+# by an empty frame when the file is not present.
+BOOKS_CSV = Path("self_help_books.csv")
+REVIEWS_CSV = Path("self_help_reviews.csv")
+df_books = pd.read_csv(BOOKS_CSV)
+if REVIEWS_CSV.exists():
+    df_reviews = pd.read_csv(REVIEWS_CSV)
+else:
+    df_reviews = pd.DataFrame()
+# ---------------------------------------------------------------------------
+# 1. VERY LIGHT TEXT PRE-PROCESSING + TF-IDF FEATURES
+# ---------------------------------------------------------------------------
+def _prep(text: str) -> str:
+    """Return *text* as a lower-cased string, or "" for a missing value.
+
+    Values that pandas treats as null (``None``, ``NaN``) map to the empty
+    string; anything else goes through ``str()`` before lower-casing.
+    """
+    if pd.isnull(text):
+        return ""
+    return str(text).lower()
+# Build the text that summarises each book (only if not already present).
+if "combined_text" not in df_books.columns:
+    df_books["combined_text"] = (
+        df_books["summary"].apply(_prep) + " " +
+        df_books["genres"].apply(_prep)  + " " +
+        df_books["key_cat_primary"].apply(_prep)
+    )
+else:
+    # A pre-computed column can still hold NaN (pandas reads empty CSV cells
+    # as NaN) and TfidfVectorizer rejects NaN documents, so normalise it the
+    # same way as the freshly built text. Lower-casing here does not change
+    # the features: the vectorizer lower-cases by default.
+    df_books["combined_text"] = df_books["combined_text"].apply(_prep)
+# Fit once at start-up; queries are later projected into this same vocabulary.
+vectorizer = TfidfVectorizer(stop_words="english", max_features=50_000)
+X_BOOKS    = vectorizer.fit_transform(df_books["combined_text"])
+# ---------------------------------------------------------------------------
+# 2. AUTHOR-LEVEL AGGREGATION  (fallbacks if columns are missing)
+# ---------------------------------------------------------------------------
+if not {"helpful_ratio", "total_reviews"}.issubset(df_books.columns):
+    # Without both metric columns there is nothing to aggregate; an empty
+    # frame with the expected columns keeps the later merge working.
+    author_stats = pd.DataFrame(
+        columns=["author_clean", "helpful_ratio", "total_reviews"]
+    )
+else:
+    # One row per author: mean helpful ratio and summed review count.
+    author_stats = df_books.groupby("author_clean", as_index=False).agg(
+        helpful_ratio=("helpful_ratio", "mean"),
+        total_reviews=("total_reviews", "sum"),
+    )
+# ---------------------------------------------------------------------------
+# 3. MAIN RECOMMENDATION FUNCTIONS
+# ---------------------------------------------------------------------------
+def _sample_reviews(book_name, flag_column: str, count: int) -> list:
+    """Return up to `count` review texts for `book_name` flagged by `flag_column`."""
+    flags = df_reviews[flag_column]
+    # NaN is truthy under astype(bool), so mask missing flags out explicitly.
+    flagged = flags.notna() & flags.astype(bool)
+    texts = df_reviews.loc[(df_reviews["name"] == book_name) & flagged,
+                           "review_text"]
+    if texts.empty:
+        return []
+    # Fixed seed: the same book always shows the same sample.
+    return texts.sample(min(count, len(texts)), random_state=42).tolist()
+def recommend_books(user_issue: str,
+                    top_n: int = 5,
+                    reviews_per_book: int = 2,
+                    min_reviews: int = 10) -> pd.DataFrame:
+    """
+    Blend topical similarity (70 %) with helpfulness (30 %)
+    and return the `top_n` books best suited to `user_issue`.
+
+    Parameters
+    ----------
+    user_issue : free-text concern; lower-cased and projected into the
+        TF-IDF space fitted on the book texts. ``None`` is treated as "".
+    top_n : number of books to return.
+    reviews_per_book : maximum helpful / harmful reviews sampled per book.
+    min_reviews : books with fewer `total_reviews` are dropped (only applied
+        when that column exists).
+
+    Returns
+    -------
+    DataFrame with columns Book, Author, Star_Rating, Price, Helpful_Ratio,
+    Similarity, Helpful Reviews, Harmful Reviews (the last two hold lists).
+    """
+    # Counts may arrive as floats from the UI; pandas needs real ints.
+    top_n = int(top_n)
+    reviews_per_book = int(reviews_per_book)
+    # ---- similarity -------------------------------------------------------
+    query_vec  = vectorizer.transform([str(user_issue or "").lower()])
+    similarity = cosine_similarity(query_vec, X_BOOKS).ravel()
+    df_temp = df_books.copy()
+    df_temp["similarity"] = similarity
+    # A missing helpfulness column counts as 0 for every book. The previous
+    # `.get("helpful_ratio", 0).fillna(0)` crashed here because the default
+    # is a plain int.
+    if "helpful_ratio" in df_temp.columns:
+        df_temp["helpful_ratio_filled"] = df_temp["helpful_ratio"].fillna(0)
+    else:
+        df_temp["helpful_ratio_filled"] = 0.0
+    if "total_reviews" in df_temp.columns:
+        # .copy() so the score column below is not written onto a slice.
+        df_temp = df_temp[df_temp["total_reviews"] >= min_reviews].copy()
+    df_temp["score"] = (
+        0.70 * df_temp["similarity"] +
+        0.30 * df_temp["helpful_ratio_filled"]
+    )
+    top_books = df_temp.nlargest(top_n, "score").reset_index(drop=True)
+    # ---- representative reviews ------------------------------------------
+    # Sample reviews only when the reviews table has every column we index.
+    have_reviews = (
+        not df_reviews.empty
+        and {"name", "review_text", "is_helpful", "is_harmful"}
+        .issubset(df_reviews.columns)
+    )
+    results = []
+    for _, row in top_books.iterrows():
+        name   = row.get("name", row.get("Book", ""))
+        author = row.get("author_clean", row.get("Author", ""))
+        if have_reviews:
+            helpful_reviews = _sample_reviews(name, "is_helpful", reviews_per_book)
+            harmful_reviews = _sample_reviews(name, "is_harmful", reviews_per_book)
+        else:
+            helpful_reviews, harmful_reviews = [], []
+        results.append({
+            "Book"            : name,
+            "Author"          : author,
+            "Star_Rating"     : row.get("star_rating", np.nan),
+            "Price"           : row.get("kindle_price_clean", np.nan),
+            "Helpful_Ratio"   : round(row.get("helpful_ratio", 0), 3),
+            "Similarity"      : round(row["similarity"], 3),
+            "Helpful Reviews" : helpful_reviews,
+            "Harmful Reviews" : harmful_reviews
+        })
+    # Explicit columns keep the schema stable even when no book qualifies.
+    return pd.DataFrame(results, columns=[
+        "Book", "Author", "Star_Rating", "Price", "Helpful_Ratio",
+        "Similarity", "Helpful Reviews", "Harmful Reviews",
+    ])
+def recommend_authors(user_issue: str,
+                      top_n: int = 5,
+                      min_reviews: int = 30):
+    """
+    Rank authors for `user_issue` and return a pair of DataFrames:
+        • authors with helpful_ratio >= 0.5 (likely helpful)
+        • authors with helpful_ratio <  0.5 (approach with caution)
+    Each frame holds at most `top_n` rows, ordered by
+    score = 0.70 * best book similarity + 0.30 * helpful_ratio.
+    Authors with fewer than `min_reviews` total reviews are excluded.
+    """
+    issue_vec = vectorizer.transform([user_issue.lower()])
+    per_book = pd.DataFrame({
+        "author_clean": df_books["author_clean"],
+        "sim_to_issue": cosine_similarity(issue_vec, X_BOOKS).ravel(),
+    })
+    # An author's relevance is that of their single most similar book.
+    best_sim = (
+        per_book.groupby("author_clean")
+        .agg(max_sim=("sim_to_issue", "max"))
+        .reset_index()
+    )
+    merged = best_sim.merge(author_stats, on="author_clean", how="left")
+    # Authors without aggregated stats count as 0 helpfulness / 0 reviews.
+    for stat in ("helpful_ratio", "total_reviews"):
+        merged[stat] = merged[stat].fillna(0)
+    enough_reviews = merged["total_reviews"] >= min_reviews
+    merged = merged[enough_reviews]
+    merged = merged.assign(
+        score=0.70 * merged["max_sim"] + 0.30 * merged["helpful_ratio"]
+    )
+
+    def _top(frame):
+        return frame.nlargest(top_n, "score").reset_index(drop=True)
+
+    helpful_authors = _top(merged[merged["helpful_ratio"] >= 0.5])
+    risky_authors = _top(merged[merged["helpful_ratio"] < 0.5])
+    return helpful_authors, risky_authors
+# ---------------------------------------------------------------------------
+# 4. GRADIO GLUE – format nicely & expose a simple interface
+# ---------------------------------------------------------------------------
+def _format_output(books_df, good_authors, bad_authors) -> str:
+    """Render book and author recommendations as one plain-text report.
+
+    `books_df` rows must provide Book, Author, Star_Rating, Price,
+    Helpful_Ratio, "Helpful Reviews" and "Harmful Reviews"; both author
+    frames must provide author_clean and helpful_ratio. A review section is
+    printed only when its list is non-empty.
+    """
+    parts = ["=== RECOMMENDED BOOKS ===\n\n"]
+    for _, book in books_df.iterrows():
+        parts.extend([
+            f"📚 {book['Book']}\n",
+            f"👤 Author: {book['Author']}\n",
+            f"⭐ Rating: {book['Star_Rating']}\n",
+            f"💰 Price: ${book['Price']}\n",
+            f"📊 Helpful Ratio: {book['Helpful_Ratio']:.2f}\n",
+        ])
+        review_sections = (
+            ("\n✅ Helpful Reviews:\n", book["Helpful Reviews"]),
+            ("\n⚠️ Critical Reviews:\n", book["Harmful Reviews"]),
+        )
+        for heading, reviews in review_sections:
+            if reviews:
+                parts.append(heading)
+                parts.extend(f"• {review}\n" for review in reviews)
+        parts.append("\n" + "-" * 50 + "\n\n")
+    parts.append("=== RECOMMENDED AUTHORS ===\n\n")
+    author_sections = (
+        ("✅ Authors Likely to be Helpful:\n", good_authors),
+        ("\n⚠️ Authors to Approach with Caution:\n", bad_authors),
+    )
+    for heading, authors in author_sections:
+        parts.append(heading)
+        for _, author in authors.iterrows():
+            parts.append(
+                f"• {author['author_clean']} (Helpful ratio: {author['helpful_ratio']:.2f})\n"
+            )
+    return "".join(parts)
+def recommend_for_concern(concern: str,
+                          num_books: int = 5,
+                          num_reviews: int = 2) -> str:
+    """Gradio entry point: return the formatted recommendation report.
+
+    `concern` is the user's free-text issue, `num_books` caps both the book
+    list and each author list, and `num_reviews` caps the reviews sampled
+    per book.
+    """
+    # Normalise UI values at the boundary: an empty textbox may be passed as
+    # None (which would crash on `.lower()` downstream), and slider values
+    # can arrive as floats, which pandas' nlargest/sample do not accept.
+    concern = "" if concern is None else str(concern)
+    num_books = int(num_books)
+    num_reviews = int(num_reviews)
+    books_df = recommend_books(concern,
+                               top_n=num_books,
+                               reviews_per_book=num_reviews)
+    good_authors, bad_authors = recommend_authors(concern,
+                                                  top_n=num_books)
+    return _format_output(books_df, good_authors, bad_authors)
+# ---------------------------------------------------------------------------
+# 5. LAUNCH GRADIO
+# ---------------------------------------------------------------------------
+# Expose recommend_for_concern as a three-input form (free-text concern plus
+# two integer-step sliders) with a single text output.
 iface = gr.Interface(
     fn=recommend_for_concern,
     inputs=[
+        gr.Textbox(label="What concern or fear would you like help with?",
+                   placeholder="e.g. I'm a lonely teenager"),
+        gr.Slider(label="Number of recommendations",
+                  minimum=1, maximum=10, step=1, value=5),
+        gr.Slider(label="Reviews per book",
+                  minimum=1, maximum=5, step=1, value=2),
     ],
     outputs=gr.Textbox(label="Recommendations", lines=20),
+    title="Self-Help Book Recommendation Engine",
+    description="Personalised, review-aware book & author suggestions.",
+    # Each example row is [concern, number of recommendations, reviews per book].
     examples=[
         ["I'm a lonely teenager", 5, 2],
         ["I'm worried about my career", 5, 2],
+        ["I have anxiety about the future", 5, 2],
+    ],
 )
+# Runs at import time: there is no __main__ guard around the launch call.
 iface.launch()