Update themes-ui.py
themes-ui.py +751 -523
themes-ui.py
CHANGED
@@ -1,545 +1,773 @@
 import sys
-
-import google.protobuf.json_format
-import google.api_core.exceptions
-
-from google.ai import generativelanguage as glm
-from google.generativeai import string_utils
-
-__all__ = [
-    "AsyncGenerateContentResponse",
-    "BlockedPromptException",
-    "StopCandidateException",
-    "IncompleteIterationError",
-    "BrokenResponseError",
-    "GenerationConfigDict",
-    "GenerationConfigType",
-    "GenerationConfig",
-    "GenerateContentResponse",
-]
-
-if sys.version_info < (3, 10):
-
-    def aiter(obj):
-        return obj.__aiter__()
-
-    async def anext(obj, default=None):
-        try:
-            return await obj.__anext__()
-        except StopAsyncIteration:
-            if default is not None:
-                return default
-            else:
-                raise
-
-
-class BlockedPromptException(Exception):
-    pass
-
-
-class StopCandidateException(Exception):
-    pass
-
-
-class IncompleteIterationError(Exception):
-    pass
-
-
-class BrokenResponseError(Exception):
-    pass
-
-
-class GenerationConfigDict(TypedDict):
-    # TODO(markdaoust): Python 3.11+ use `NotRequired`, ref: https://peps.python.org/pep-0655/
-    candidate_count: int
-    stop_sequences: Iterable[str]
-    max_output_tokens: int
-    temperature: float
-
-
-@dataclasses.dataclass
-class GenerationConfig:
-    """A simple dataclass used to configure the generation parameters of `GenerativeModel.generate_content`.
-
-    Attributes:
-        candidate_count:
-            Number of generated responses to return.
-        stop_sequences:
-            The set of character sequences (up
-            to 5) that will stop output generation. If
-            specified, the API will stop at the first
-            appearance of a stop sequence. The stop sequence
-            will not be included as part of the response.
-        max_output_tokens:
-            The maximum number of tokens to include in a
-            candidate.
-
-            If unset, this will default to output_token_limit specified
-            in the model's specification.
-        temperature:
-            Controls the randomness of the output. Note: The
-            default value varies by model, see the `Model.temperature`
-            attribute of the `Model` returned the `genai.get_model`
-            function.
-
-            Values can range from [0.0,1.0], inclusive. A value closer
-            to 1.0 will produce responses that are more varied and
-            creative, while a value closer to 0.0 will typically result
-            in more straightforward responses from the model.
-        top_p:
-            Optional. The maximum cumulative probability of tokens to
-            consider when sampling.
-
-            The model uses combined Top-k and nucleus sampling.
-
-            Tokens are sorted based on their assigned probabilities so
-            that only the most likely tokens are considered. Top-k
-            sampling directly limits the maximum number of tokens to
-            consider, while Nucleus sampling limits number of tokens
-            based on the cumulative probability.
-
-            Note: The default value varies by model, see the
-            `Model.top_p` attribute of the `Model` returned the
-            `genai.get_model` function.
-
-        top_k (int):
-            Optional. The maximum number of tokens to consider when
-            sampling.
-
-            The model uses combined Top-k and nucleus sampling.
-
-            Top-k sampling considers the set of `top_k` most probable
-            tokens. Defaults to 40.
-
-            Note: The default value varies by model, see the
-            `Model.top_k` attribute of the `Model` returned the
-            `genai.get_model` function.
-    """
-
-    candidate_count: int | None = None
-    stop_sequences: Iterable[str] | None = None
-    max_output_tokens: int | None = None
-    temperature: float | None = None
-    top_p: float | None = None
-    top_k: int | None = None
-
-
-GenerationConfigType = Union[glm.GenerationConfig, GenerationConfigDict, GenerationConfig]
-
-
-def to_generation_config_dict(generation_config: GenerationConfigType):
-    if generation_config is None:
-        return {}
-    elif isinstance(generation_config, glm.GenerationConfig):
-        return type(generation_config).to_dict(generation_config)  # pytype: disable=attribute-error
-    elif isinstance(generation_config, GenerationConfig):
-        generation_config = dataclasses.asdict(generation_config)
-        return {key: value for key, value in generation_config.items() if value is not None}
-    elif hasattr(generation_config, "keys"):
-        return dict(generation_config)
-    else:
-        raise TypeError(
-            "Did not understand `generation_config`, expected a `dict` or"
-            f" `GenerationConfig`\nGot type: {type(generation_config)}\nValue:"
-            f" {generation_config}"
-        )
-
-
-def _join_citation_metadatas(
-    citation_metadatas: Iterable[glm.CitationMetadata],
-):
-    citation_metadatas = list(citation_metadatas)
-    return citation_metadatas[-1]

-    for rating in safety_ratings_list:
-        ratings[rating.category] = rating.probability
-        blocked[rating.category].append(rating.blocked)
-
-    blocked = {category: any(blocked) for category, blocked in blocked.items()}
-
-    safety_list = []
-    for (category, probability), blocked in zip(ratings.items(), blocked.values()):
-        safety_list.append(
-            glm.SafetyRating(category=category, probability=probability, blocked=blocked)
-        )

-    else:
-        role = ""
-
-        parts.extend(content.parts)

-    index = candidates[0].index  # These should all be the same.
-
-    return glm.Candidate(
-        index=index,
-        content=_join_contents([c.content for c in candidates]),
-        finish_reason=candidates[-1].finish_reason,
-        safety_ratings=_join_safety_ratings_lists([c.safety_ratings for c in candidates]),
-        citation_metadata=_join_citation_metadatas([c.citation_metadata for c in candidates]),
-    )

-    for candidate in candidate_list:
-        candidates[candidate.index].append(candidate)
-
-    new_candidates = []
-    for index, candidate_parts in sorted(candidates.items()):
-        new_candidates.append(_join_candidates(candidate_parts))
-
-    return new_candidates
-
-
-def _join_prompt_feedbacks(
-    prompt_feedbacks: Iterable[glm.GenerateContentResponse.PromptFeedback],
-):
-    # Always return the first prompt feedback.
-    return next(iter(prompt_feedbacks))
-
-
-def _join_chunks(chunks: Iterable[glm.GenerateContentResponse]):
-    return glm.GenerateContentResponse(
-        candidates=_join_candidate_lists(c.candidates for c in chunks),
-        prompt_feedback=_join_prompt_feedbacks(c.prompt_feedback for c in chunks),
-    )
-
-
-class BaseGenerateContentResponse:
-    def __init__(
-        self,
-        done: bool,
-        iterator: (
-            None
-            | Iterable[glm.GenerateContentResponse]
-            | AsyncIterable[glm.GenerateContentResponse]
-        ),
-        result: glm.GenerateContentResponse,
-        chunks: Iterable[glm.GenerateContentResponse] | None = None,
-    ):
-        self._done = done
-        self._iterator = iterator
-        self._result = result
-        if chunks is None:
-            self._chunks = [result]
-        else:
-            self._chunks = list(chunks)
-        if result.prompt_feedback.block_reason:
-            self._error = BlockedPromptException(result)
-        else:
-            self._error = None
-
-    @property
-    def candidates(self):
-        """The list of candidate responses.
-
-        Raises:
-            IncompleteIterationError: With `stream=True` if iteration over the stream was not completed.
-        """
-        if not self._done:
-            raise IncompleteIterationError(_INCOMPLETE_ITERATION_MESSAGE)
-        return self._result.candidates
-
-    @property
-    def parts(self):
-        """A quick accessor equivalent to `self.candidates[0].parts`
-
-        Raises:
-            ValueError: If the candidate list does not contain exactly one candidate.
-        """
-        candidates = self.candidates
-        if not candidates:
-            raise ValueError(
-                "The `response.parts` quick accessor only works for a single candidate, "
-                "but none were returned. Check the `response.prompt_feedback` to see if the prompt was blocked."
-            )
-        if len(candidates) > 1:
-            raise ValueError(
-                "The `response.parts` quick accessor only works with a "
-                "single candidate. With multiple candidates use "
-                "result.candidates[index].text"
-            )
-        parts = candidates[0].content.parts
-        return parts
-
-    @property
-    def text(self):
-        """A quick accessor equivalent to `self.candidates[0].parts[0].text`
-
-        Raises:
-            ValueError: If the candidate list or parts list does not contain exactly one entry.
-        """
-        parts = self.parts
-        if not parts:
-            raise ValueError(
-                "The `response.text` quick accessor only works when the response contains a valid "
-                "`Part`, but none was returned. Check the `candidate.safety_ratings` to see if the "
-                "response was blocked."
-            )
-
-        return parts[0].text
-
-    @property
-    def prompt_feedback(self):
-        return self._result.prompt_feedback
-
-    def __str__(self) -> str:
-        if self._done:
-            _iterator = "None"
-        else:
-            _iterator = f"<{self._iterator.__class__.__name__}>"
-
-        _result = f"glm.GenerateContentResponse({type(self._result).to_dict(self._result)})"
-
-        if self._error:
-            _error = f",\nerror=<{self._error.__class__.__name__}> {self._error}"
-        else:
-            _error = ""
-
-        return (
-            textwrap.dedent(
-                f"""\
-                response:
-                {type(self).__name__}(
-                    done={self._done},
-                    iterator={_iterator},
-                    result={_result},
-                )"""
-            )
-            + _error
-        )
-
-    __repr__ = __str__
-
-
-@contextlib.contextmanager
-def rewrite_stream_error():
     try:

-    for chunk in response:
-        print(chunk.text)
-    ```
-
-    `GenerateContentResponse.prompt_feedback` is available immediately but
-    `GenerateContentResponse.candidates`, and all the attributes derived from them (`.text`, `.parts`),
-    are only available after the iteration is complete.
     """
+import streamlit as st
+import requests
+import pandas as pd
+import time
+import json
+import re
 import sys
+import subprocess
+from datetime import datetime, date, timedelta
+from urllib.parse import urlencode
+from typing import Dict, List, Optional
+import google.generativeai as genai
+import plotly.express as px
+
+
+# Handle required package installations and imports
+def install_package(package_name: str) -> None:
+    try:
+        subprocess.check_call(
+            [sys.executable, "-m", "pip", "install", package_name],
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL
+        )
+    except subprocess.CalledProcessError as e:
+        st.error(f"Failed to install {package_name}: {str(e)}")
+        sys.exit(1)
+
+try:
+    import google.generativeai as genai
+except ImportError:
+    install_package("google-generativeai")
+    import google.generativeai as genai
+
+# Import Plotly
+try:
+    import plotly.express as px
+except ImportError:
+    install_package("plotly")
+    import plotly.express as px
+
+# Set up the application title and layout
+st.set_page_config(
+    page_title="Steam App Reviews - Themes Analysis",
+    page_icon="🎮",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
+
+# Custom CSS to improve UI
+st.markdown("""
+<style>
+    .main-header {
+        font-size: 2.5rem !important;
+        color: #1e88e5;
+    }
+    .theme-card {
+        background-color: #f5f5f5;
+        border-radius: 10px;
+        padding: 1.5rem;
+        margin-bottom: 1rem;
+        border-left: 5px solid #1e88e5;
+    }
+    .theme-title {
+        font-size: 1.2rem;
+        font-weight: bold;
+        color: #1e88e5;
+    }
+    .theme-desc {
+        color: #424242;
+        margin: 0.5rem 0;
+    }
+    .theme-count {
+        font-size: 0.9rem;
+        color: #616161;
+    }
+    .sentiment-positive {
+        background-color: #D5EAD8;
+        color: #2E8B57;
+        padding: 3px 8px;
+        border-radius: 10px;
+    }
+    .sentiment-negative {
+        background-color: #FFE4E1;
+        color: #CD5C5C;
+        padding: 3px 8px;
+        border-radius: 10px;
+    }
+    .sentiment-mixed {
+        background-color: #FFF8DC;
+        color: #DAA520;
+        padding: 3px 8px;
+        border-radius: 10px;
+    }
+    .app-info {
+        background-color: #f0f8ff;
+        border-radius: 10px;
+        padding: 1rem;
+        margin-bottom: 1rem;
+    }
+</style>
+""", unsafe_allow_html=True)
+
+# Title and description
+st.markdown('<h1 class="main-header">🎮 Steam App Reviews - Themes Analysis</h1>', unsafe_allow_html=True)
+st.markdown("""
+This tool analyzes user reviews for Steam games to identify common themes, sentiments, and feedback patterns.
+Upload your Google Gemini API key, enter a Steam App ID, select a date range, and get valuable insights from user reviews.
+""")
+
+# Initialize session state variables
+if 'reviews_data' not in st.session_state:
+    st.session_state['reviews_data'] = None
+if 'themes_df' not in st.session_state:
+    st.session_state['themes_df'] = None
+if 'app_info' not in st.session_state:
+    st.session_state['app_info'] = None
+
+# Sidebar inputs for user interaction
+st.sidebar.header("User Input Parameters")
+
+# User input for Google Gemini API key
+api_key_input = st.sidebar.text_input(
+    "Enter your Google Gemini API Key:",
+    type="password",
+    help="Your API key will be used to access the Google Gemini API for theme extraction.",
+)
+
+# Initialize Google Gemini client
+gemini_client = None
+if api_key_input:
+    try:
+        genai.configure(api_key=api_key_input)
+        model = genai.GenerativeModel(model_name='gemini-1.5-pro')
+        gemini_client = model
+        st.sidebar.success("Gemini API connection established!")
+    except Exception as e:
+        st.sidebar.error(f"Error initializing Gemini API: {str(e)}")
+else:
+    st.sidebar.warning("Please enter your Google Gemini API Key to proceed.")
+
+# User input for App ID
+appid = st.sidebar.text_input(
+    "Enter the Steam App ID:",
+    value="1782120",
+    help="Find the App ID in the URL of the game's Steam page."
+)
+
+# Validate App ID
+def is_valid_app_id(app_id: str) -> bool:
+    if not app_id or not app_id.isdigit():
+        return False
+    return True
+
+if not is_valid_app_id(appid):
+    st.sidebar.error("Please enter a valid Steam App ID (numeric only).")
+
+# Date input for selecting a range
+st.sidebar.write("Select the date range for reviews:")
+start_date = st.sidebar.date_input(
+    "Start Date",
+    value=datetime.today() - timedelta(days=7)
+)
+end_date = st.sidebar.date_input(
+    "End Date",
+    value=datetime.today()
+)
+
+# Validate date range
+if start_date and end_date:
+    today = date.today()
+
+    # Check if end date is in the future
+    if end_date > today:
+        st.sidebar.error("Error: End date cannot be in the future.")
+        st.stop()
+
+    # Check if start date is after end date
+    if start_date > end_date:
+        st.sidebar.error("Error: Start date must be before end date.")
+        st.stop()
+
+    # Check if date range is too large
+    date_range = (end_date - start_date).days
+    if date_range > 365:
+        st.sidebar.warning("Warning: Large date ranges may result in incomplete data due to Steam API limitations.")
+    elif date_range < 0:
+        st.sidebar.error("Error: Invalid date range selected.")
+        st.stop()
+
+# Maximum reviews to fetch
+max_reviews = st.sidebar.slider(
+    "Maximum reviews to fetch:",
+    min_value=50,
+    max_value=500,
+    value=200,
+    step=50,
+    help="Higher values may take longer to process."
+)
+
+# Language filter
+language_filter = st.sidebar.multiselect(
+    "Filter by languages:",
+    options=["english", "spanish", "french", "german", "italian", "russian", "all"],
+    default=["english"],
+    help="Select 'all' to include all languages or choose specific languages."
+)
+
+# Advanced options
+advanced_options = st.sidebar.expander("Advanced Analysis Options")
+with advanced_options:
+    include_sentiment = st.checkbox(
+        "Include sentiment analysis",
+        value=True,
+        help="Analyze the sentiment of each review and theme."
+    )
+
+    cluster_similar_themes = st.checkbox(
+        "Cluster similar themes",
+        value=True,
+        help="Group themes that are semantically similar."
+    )
+
+    min_mention_threshold = st.slider(
+        "Minimum reviews per theme:",
+        min_value=1,
+        max_value=10,
+        value=2,
+        help="Only show themes mentioned in at least this many reviews."
+    )
+
+# Function to fetch app information
+@st.cache_data(ttl=3600, show_spinner=False)
+def get_app_info(app_id: str) -> Optional[Dict]:
+    """
+    Fetches information about a Steam game using its App ID.
+    """
+    try:
+        url = f"https://store.steampowered.com/api/appdetails?appids={app_id}"
+        response = requests.get(url, timeout=10)
+        response.raise_for_status()
+        data = response.json()
+
+        if data.get(app_id, {}).get('success', False):
+            app_data = data[app_id]['data']
+            return {
+                'name': app_data.get('name', 'Unknown Game'),
+                'header_image': app_data.get('header_image', ''),
+                'release_date': app_data.get('release_date', {}).get('date', 'Unknown'),
+                'developers': app_data.get('developers', ['Unknown']),
+                'publishers': app_data.get('publishers', ['Unknown'])
+            }
+        return None
+    except Exception as e:
+        st.sidebar.error(f"Error fetching app info: {str(e)}")
+        return None
+
+# Function to fetch reviews
+@st.cache_data(ttl=1800, show_spinner=False)
+def fetch_reviews(app_id: str, start_timestamp: int, end_timestamp: int,
+                  max_reviews: int = 1000, language_filter: List[str] = ["english"]) -> Optional[List]:
+    """
+    Fetches Steam reviews for the specified app within the given date range.
+    Implements batch processing and caching for efficient handling of large volumes.
+    """
+    # Define the base API URL
+    base_url = f"https://store.steampowered.com/appreviews/{app_id}?json=1"
+
+    # Normalize language filter and handle 'all' case
+    normalized_language_filter = [lang.lower() for lang in language_filter]
+    use_all_languages = "all" in normalized_language_filter
+
+    # Calculate day range dynamically based on start and end timestamps
+    day_range = min(365, (end_timestamp - start_timestamp) // 86400 + 1)
+
+    # Define initial API parameters with optimized batch size
+    params = {
+        "filter": "updated",  # Use 'updated' to get all reviews in date range
+        "language": "all" if use_all_languages else ",".join(normalized_language_filter),
+        "day_range": str(day_range),
+        "review_type": "all",
+        "purchase_type": "all",
+        "num_per_page": "100",  # Maximum allowed by Steam API
+        "cursor": "*",
+        "filter_offtopic_activity": 0,
+        "start_date": start_timestamp,
+        "end_date": end_timestamp
+    }
+
+    # Initialize cache for review batches
+    if 'review_cache' not in st.session_state:
+        st.session_state.review_cache = {}
+    cache_key = f"{app_id}_{start_timestamp}_{end_timestamp}_{language_filter}"
+
+    # Check cache first
+    if cache_key in st.session_state.review_cache:
+        cached_reviews = st.session_state.review_cache[cache_key]
+        if len(cached_reviews) >= max_reviews:
+            return cached_reviews[:max_reviews]
+
+    reviews_list = []
+    request_count = 0
+    max_requests = 100  # Increased limit for larger datasets
+    retry_attempts = 3  # Number of retry attempts for failed requests
+    batch_size = 100  # Size of each batch
+
+    progress_bar = st.progress(0)
+    status_text = st.empty()
+
+    # Create a container for batch progress
+    batch_container = st.empty()
+
+    while True:
+        # URL encode the cursor parameter
+        params_encoded = params.copy()
+        params_encoded["cursor"] = params["cursor"].replace("+", "%2B")
+
+        # Construct the full URL with parameters
+        url = base_url + "&" + urlencode(params_encoded)
+
+        try:
+            for attempt in range(retry_attempts):
+                response = requests.get(url, timeout=15)
+                response.raise_for_status()
+                data = response.json()
+
+                # Check if we have any reviews
+                reviews = data.get('reviews')
+                if not reviews:
+                    status_text.warning("No reviews found for the specified date range and filters.")
+                    return []
+
+                # Process reviews
+                for review in reviews:
+                    timestamp = review.get("timestamp_created", 0)
+                    review_language = review.get("language", "").lower()
+
+                    # Validate timestamp is within range
+                    is_in_timerange = start_timestamp <= timestamp <= end_timestamp
+
+                    # Check language filter
+                    is_valid_language = "all" in language_filter or review_language in [lang.lower() for lang in language_filter]
+
+                    if is_in_timerange and is_valid_language:
+                        reviews_list.append(review)
+
+                # Update progress
+                progress = min(len(reviews_list) / max_reviews * 100, 100)
+                progress_bar.progress(int(progress))
+                status_text.text(f"Fetched {len(reviews_list)} reviews...")
+
+                # Check if we've reached max reviews or earlier timestamp
+                if len(reviews_list) >= max_reviews:
+                    break
+                if any(r.get("timestamp_created", 0) < start_timestamp for r in reviews):
+                    break
+
+                # Update cursor for next batch
+                new_cursor = data.get("cursor")
+                if new_cursor is None or params["cursor"] == new_cursor:
+                    break
+                params["cursor"] = new_cursor
+
+                # Handle rate limiting
+                if 'X-Rate-Limit-Remaining' in response.headers:
+                    remaining_calls = int(response.headers['X-Rate-Limit-Remaining'])
+                    time.sleep(0.5 if remaining_calls < 10 else 0.2)
+                else:
+                    time.sleep(0.2)
+
+                # Update batch progress
+                batch_container.text(f"Processing batch {request_count + 1} of {max_requests} (max)")
+
+                # Check request limits
+                request_count += 1
+                if request_count >= max_requests:
+                    status_text.warning("Reached maximum number of requests. Some reviews may not be fetched.")
+                    break
+
+                break  # Success - exit retry loop
+
+        except requests.exceptions.RequestException as e:
+            status_text.error(f"Steam API Error: {str(e)}")
+            if attempt < retry_attempts - 1:
+                time.sleep(1)  # Wait before retrying
+                continue
+            return None
+
+    # Clean up progress indicators
+    progress_bar.empty()
+    status_text.empty()
+    batch_container.empty()
+
+    # Cache and return results
+    st.session_state.review_cache[cache_key] = reviews_list
+    return reviews_list
+
+# Function to extract themes using Google Gemini 1.5 Pro
+def extract_themes(df: pd.DataFrame,
+                   include_sentiment: bool = True,
+                   cluster_similar_themes: bool = True,
+                   min_mention_threshold: int = 2) -> Optional[pd.DataFrame]:
+    """
+    Uses Google Gemini 1.5 Pro to identify the most common themes in reviews.
+    Implements batched processing and caching for large datasets.
+    """
+    if len(df) == 0:
+        st.error("No reviews to analyze.")
+        return None
+
+    # Get counts of positive and negative reviews (if available)
+    positive_count = 0
+    negative_count = 0
+
+    if "Recommended" in df.columns:
+        positive_count = df["Recommended"].sum()
+        negative_count = len(df) - positive_count
+
+    # Initialize theme cache
+    if 'theme_cache' not in st.session_state:
+        st.session_state.theme_cache = {}
+
+    # Calculate cache key based on review content hash
+    cache_key = hash(tuple(sorted(df['Review'].values)))
+
+    # Check cache first
+    if cache_key in st.session_state.theme_cache:
+        return st.session_state.theme_cache[cache_key]
+
+    # Process reviews in batches to handle large datasets
+    batch_size = 200  # Optimal batch size for Gemini API
+    total_batches = (len(df) + batch_size - 1) // batch_size
+
+    all_themes = []
+    progress_bar = st.progress(0)
+    batch_status = st.empty()
+
+    for batch_idx in range(total_batches):
+        start_idx = batch_idx * batch_size
+        end_idx = min(start_idx + batch_size, len(df))
+        df_batch = df.iloc[start_idx:end_idx]
+
+        # Combine reviews into a single string with IDs
+        reviews_text = "\n\n".join([
+            f"Review ID: {row['Review ID']}\nReview Text: {row['Review']}"
+            for _, row in df.iterrows()
+        ])
+
+        # Prepare the prompt
+        sentiment_instruction = "For each theme, analyze the sentiment (Positive, Negative, or Mixed)." if include_sentiment else ""
+        clustering_instruction = "Cluster similar themes together." if cluster_similar_themes else ""
+
+        # Fix the JSON template structure
+        sentiment_field = '"Sentiment": "Positive/Negative/Mixed",' if include_sentiment else ""
+
+        prompt = f"""
+        Analyze these {len(df)} user reviews for a game with {positive_count} positive and {negative_count} negative reviews.
+
+        Identify significant themes. {clustering_instruction}
+        For each theme:
+        1. Provide a concise, specific name
+        2. Write a detailed description summarizing user feedback
+        3. List the Review IDs where the theme is mentioned
+        4. {sentiment_instruction}
+
+        Only include themes mentioned in at least {min_mention_threshold} different reviews.
+
+        Provide the output as a JSON array with the following structure:
+        [
+            {{
+                "Theme": "theme_name",
+                "Description": "detailed_description",
+                "Review IDs": ["id1", "id2", ...],
+                {sentiment_field}
+            }},
+            ...
+        ]
+
+        Reviews:
+        {reviews_text}
+        """
+
+        # Call Google Gemini 1.5 Pro
+        try:
+            with st.spinner("Analyzing themes with Google Gemini 1.5 Pro..."):
+                response = model.generate_content(prompt)
+
+            # Extract text from the response
+            if hasattr(response, 'text'):
+                response_text = response.text
+            elif hasattr(response, 'parts') and response.parts:
+                response_text = response.parts[0].text
+            else:
+                response_text = str(response)
+
+            # Clean and parse the response text
+            # First try to extract JSON from code blocks
+            json_pattern = r'```(?:json)?(.*?)```'
+            json_matches = re.findall(json_pattern, response_text, re.DOTALL)
+
+            if json_matches:
+                # Use the first JSON block found
+                json_str = json_matches[0].strip()
+            else:
+                # If no code blocks, try to use the entire response as JSON
+                # Remove any markdown formatting or extra whitespace
+                json_str = response_text.strip()
+
+            # Parse the JSON output
+            themes_data = json.loads(json_str)
+
+            # Convert to DataFrame and add count column
+            themes_df = pd.DataFrame(themes_data)
+            themes_df["Count"] = themes_df["Review IDs"].apply(len)
+
+            # Sort themes by count (descending)
+            themes_df = themes_df.sort_values("Count", ascending=False).reset_index(drop=True)
+
+            return themes_df
+
+        except Exception as e:
+            st.error(f"Error extracting themes: {str(e)}")
+            st.error("Response from Gemini API:")
+            if 'response' in locals():
+                try:
+                    if hasattr(response, 'text'):
+                        error_text = response.text
+                    elif hasattr(response, 'parts') and response.parts:
+                        error_text = response.parts[0].text
+                    else:
+                        error_text = str(response)
+                    st.error(error_text)
+                except Exception as e:
+                    st.error(f"Error displaying response: {str(e)}")
+            return None
+
+# Function to create visualizations
+def create_visualizations(themes_df: pd.DataFrame, reviews_df: pd.DataFrame):
+    """
+    Creates visualizations for the theme analysis.
+    """
+    col1, col2 = st.columns(2)
+
+    # Theme distribution chart
+    with col1:
+        theme_counts = themes_df[["Theme", "Count"]]
+        fig = px.bar(
+            theme_counts,
+            x="Count", y="Theme", orientation="h",
+            title="Theme Distribution",
+        )
+        fig.update_layout(height=400)
+        st.plotly_chart(fig, use_container_width=True)
+
+    # Sentiment analysis chart (if available)
+    with col2:
+        if "Sentiment" in themes_df.columns:
+            sentiment_counts = themes_df["Sentiment"].value_counts().reset_index()
+            sentiment_counts.columns = ["Sentiment", "Count"]
+            fig = px.pie(
+                sentiment_counts,
+                values="Count", names="Sentiment",
+                title="Theme Sentiment Distribution",
+                color="Sentiment",
+                color_discrete_map={"Positive": "#2E8B57", "Negative": "#CD5C5C", "Mixed": "#DAA520"},
+            )
+            fig.update_layout(height=400)
+            st.plotly_chart(fig, use_container_width=True)
+
+    # Review timeline (if timestamp available)
+    if "Timestamp" in reviews_df.columns:
+        # Convert timestamp to datetime
+        reviews_df["Date"] = pd.to_datetime(reviews_df["Timestamp"], unit='s')
+
+        # Group by date and count
+        reviews_by_date = reviews_df.groupby(reviews_df["Date"].dt.date).size().reset_index()
+        reviews_by_date.columns = ["Date", "Count"]
+
+        # Create timeline chart
+        fig = px.line(
+            reviews_by_date,
+            x="Date", y="Count",
+            title="Reviews Timeline",
+            markers=True
+        )
+        st.plotly_chart(fig, use_container_width=True)
+
+# Validate inputs before processing
+if start_date > end_date:
+    st.error("Error: End date must fall after start date.")
+elif not api_key_input:
+    st.info("Please input your Google Gemini API Key to proceed.")
+elif not is_valid_app_id(appid):
+    st.error("Please enter a valid Steam App ID.")
+else:
+    # Fetch app info
+    if st.session_state['app_info'] is None or st.session_state.get('current_appid') != appid:
+        st.session_state['app_info'] = get_app_info(appid)
+        st.session_state['current_appid'] = appid
+
+    # Display app info if available
+    if st.session_state['app_info']:
+        app_info = st.session_state['app_info']
+        col1, col2 = st.columns([1, 3])
+
+        with col1:
+            st.image(app_info['header_image'], width=200)
+
+        with col2:
+            st.markdown(f"""
+            <div class='app-info'>
+                <h2>{app_info['name']}</h2>
+                <p><strong>Release Date:</strong> {app_info['release_date']}</p>
+                <p><strong>Developers:</strong> {', '.join(app_info['developers'])}</p>
+                <p><strong>Publishers:</strong> {', '.join(app_info['publishers'])}</p>
+            </div>
+            """, unsafe_allow_html=True)
+
+    # Fetch reviews button
+    if st.button("Fetch and Analyze Reviews", type="primary"):
+        # Convert dates to timestamps
+        start_timestamp = int(time.mktime(start_date.timetuple()))
+        end_timestamp = int(time.mktime((end_date + timedelta(days=1)).timetuple())) - 1  # Include the entire end date
+
+        # Fetch the reviews
+        with st.spinner("Fetching reviews from Steam..."):
+            reviews_data = fetch_reviews(
+                appid,
+                start_timestamp,
+                end_timestamp,
+                max_reviews=max_reviews,
+                language_filter=language_filter
+            )
+            st.session_state['reviews_data'] = reviews_data
+
+        # Check if reviews were fetched
+        if reviews_data:
+            st.success(f"Fetched {len(reviews_data)} reviews from App ID {appid}.")
+
+            # Create a DataFrame from the review data
+            df = pd.DataFrame(
+                [
+                    {
+                        "Review ID": str(review.get("recommendationid")),
+                        "Author SteamID": review.get("author", {}).get("steamid"),
+                        "Language": review.get("language"),
+                        "Review": review.get("review"),
+                        "Recommended": review.get("voted_up", False),
+                        "Votes Helpful": review.get("votes_up", 0),
+                        "Timestamp": review.get("timestamp_created", 0),
+                        "Posted On": datetime.fromtimestamp(
+                            review.get("timestamp_created", 0)
+                        ).strftime("%Y-%m-%d %H:%M:%S"),
+                    }
+                    for review in reviews_data
+                ]
+            )
+
+            # Extract themes using Google Gemini 1.5 Pro
+            themes_df = extract_themes(
+                df,
+                include_sentiment=include_sentiment,
+                cluster_similar_themes=cluster_similar_themes,
+                min_mention_threshold=min_mention_threshold
+            )
+            st.session_state['themes_df'] = themes_df
+
+            if themes_df is not None:
+                # Show summary statistics
+                col1, col2, col3, col4 = st.columns(4)
+                with col1:
+                    st.metric("Total Reviews", len(df))
+                with col2:
+                    positive_count = df["Recommended"].sum()
+                    positive_percent = (positive_count / len(df)) * 100 if len(df) > 0 else 0
+                    st.metric("Positive Reviews", f"{positive_count} ({positive_percent:.1f}%)")
+                with col3:
+                    negative_count = len(df) - positive_count
+                    negative_percent = (negative_count / len(df)) * 100 if len(df) > 0 else 0
+                    st.metric("Negative Reviews", f"{negative_count} ({negative_percent:.1f}%)")
+                with col4:
+                    st.metric("Themes Identified", len(themes_df))
+
+                # Create visualizations
+                create_visualizations(themes_df, df)
+
+                # Show themes analysis
+                st.markdown("## 📊 Theme Analysis")
+                st.dataframe(themes_df)
+
+                # Display detailed theme information
+                st.markdown("## 🔍 Detailed Theme Analysis")
+
+                for index, row in themes_df.iterrows():
+                    theme = row["Theme"]
+                    description = row["Description"]
+                    review_ids = row["Review IDs"]
+                    count = row["Count"]
+                    sentiment = row.get("Sentiment", "Not analyzed")
+
+                    # Create a sentiment badge with appropriate styling
+                    sentiment_class = ""
+                    if sentiment == "Positive":
+                        sentiment_class = "sentiment-positive"
+                    elif sentiment == "Negative":
+                        sentiment_class = "sentiment-negative"
+                    elif sentiment == "Mixed":
+                        sentiment_class = "sentiment-mixed"
+
+                    # Display theme card with enhanced formatting
+                    sentiment_html = f'<span class="{sentiment_class}">{sentiment}</span>' if sentiment != "Not analyzed" else ""
+
+                    st.markdown(f"""
+                    <div class="theme-card">
+                        <div class="theme-title">{theme} {sentiment_html}</div>
+                        <p class="theme-desc">{description}</p>
+                        <div class="theme-count">Mentioned in {count} reviews</div>
+                    </div>
+                    """, unsafe_allow_html=True)
+
+                    with st.expander(f"View reviews mentioning '{theme}'"):
+                        # Get the reviews that mention the theme
+                        try:
+                            reviews_with_theme = df[df["Review ID"].isin(review_ids)][["Review ID", "Review", "Posted On", "Recommended"]]
+                            st.dataframe(reviews_with_theme, use_container_width=True)
+                        except Exception as e:
+                            st.error(f"Error displaying reviews for theme '{theme}': {str(e)}")
+
+                # Export options
+                st.markdown("## 📥 Export Results")
+                col1, col2 = st.columns(2)
+
+                with col1:
+                    # Export reviews as CSV
+                    reviews_csv = df.to_csv(index=False).encode('utf-8')
+                    st.download_button(
+                        label="Download Reviews CSV",
+                        data=reviews_csv,
+                        file_name=f"steam_reviews_{appid}_{start_date}_to_{end_date}.csv",
+                        mime="text/csv"
+                    )
+
+                with col2:
+                    # Export themes as CSV
+                    themes_csv = themes_df.to_csv(index=False).encode('utf-8')
+                    st.download_button(
+                        label="Download Themes Analysis CSV",
+                        data=themes_csv,
+                        file_name=f"steam_themes_{appid}_{start_date}_to_{end_date}.csv",
+                        mime="text/csv"
+                    )
+            else:
+                st.warning("Failed to extract themes. Please try again or adjust parameters.")
+        else:
+            st.warning("No reviews found for the specified date range and filters.")
+
+# Display the raw reviews data if available
+if st.session_state['reviews_data'] is not None:
+    with st.expander("View Raw Reviews Data"):
+        reviews_df = pd.DataFrame(
+            [
+                {
+                    "Review ID": str(review.get("recommendationid")),
+                    "Author SteamID": review.get("author", {}).get("steamid"),
+                    "Language": review.get("language"),
+                    "Review": review.get("review"),
+                    "Recommended": review.get("voted_up", False),
+                    "Votes Helpful": review.get("votes_up", 0),
+                    "Posted On": datetime.fromtimestamp(
+                        review.get("timestamp_created", 0)
+                    ).strftime("%Y-%m-%d %H:%M:%S"),
+                }
+                for review in st.session_state['reviews_data']
+            ]
+        )
+        st.dataframe(reviews_df, use_container_width=True)