import streamlit as st
import requests
import PyPDF2
from typing import Dict, List
import json
from langchain.text_splitter import RecursiveCharacterTextSplitter
import xml.etree.ElementTree as ET
import time
from dotenv import load_dotenv
import os
import pandas as pd
# Load environment variables
load_dotenv()
PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY")
PERPLEXITY_API_URL = "https://api.perplexity.ai/chat/completions"
SAPLING_API_KEY = os.getenv("SAPLING_API_KEY")
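# Expected .env contents (variable names taken from the os.getenv calls above;
# the values shown are placeholders, not real keys):
#   PERPLEXITY_API_KEY=<your Perplexity API key>
#   SAPLING_API_KEY=<your Sapling API key>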
def call_perplexity_api(prompt: str) -> str:
    """Call Perplexity AI with a prompt; return the text response, or "" on failure."""
    headers = {
        "Authorization": f"Bearer {PERPLEXITY_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": "llama-3.1-sonar-small-128k-chat",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.3,
    }
    try:
        response = requests.post(
            PERPLEXITY_API_URL, headers=headers, json=payload, timeout=60
        )
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"]
    except Exception as e:
        st.error(f"API Error: {str(e)}")
        return ""
def extract_text_from_pdf(pdf_file):
    """Extract text content from a PDF file."""
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    text = ""
    for page in pdf_reader.pages:
        # extract_text() can return None for image-only pages
        text += (page.extract_text() or "") + "\n"
    return text
def analyze_paper(text: str, category: str) -> str:
    """Generate a prompt and get analysis for a specific category."""
    prompts = {
        "Summarized Abstract": "Extract and summarize the abstract from this research paper:",
        "Results": "What are the main results and findings from this research paper:",
        "Summarized Introduction": "Summarize the introduction section of this research paper:",
        "Methods Used": "What are the main methods and methodologies used in this research:",
        "Literature Survey": "Summarize the literature review or related work from this paper:",
        "Limitations": "What are the limitations mentioned in this research:",
        "Contributions": "What are the main contributions of this research:",
        "Practical Implications": "What are the practical implications of this research:",
        "Objectives": "What are the main objectives of this research:",
        "Findings": "What are the key findings from this research:",
        "Future Research": "What future research directions are suggested in this paper:",
        "Dependent Variables": "What are the dependent variables studied in this research:",
        "Independent Variables": "What are the independent variables studied in this research:",
        "Dataset": "What dataset(s) were used in this research:",
        "Problem Statement": "What is the main problem statement or research question:",
        "Challenges": "What challenges were faced or addressed in this research:",
        "Applications": "What are the potential applications of this research:",
    }
    prompt = f"{prompts[category]}\n\nPaper text: {text[:5000]}"  # Limit text to avoid token limits
    return call_perplexity_api(prompt)
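# Note: analyze_paper sends only the first 5,000 characters of the paper, so
# categories that draw on late sections of a long PDF (e.g. "Future Research")
# may come back incomplete; raising the slice trades API cost for coverage.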
class ResearchAssistant:
    def __init__(self, perplexity_key: str):
        self.perplexity_key = perplexity_key

    def chat_with_pdf(self, pdf_text: str, query: str) -> Dict:
        """Answer a question about a PDF using its most relevant chunks as context."""
        chunks = self._split_text(pdf_text)
        relevant_chunks = self._get_relevant_chunks(chunks, query)
        prompt = f"Context from PDF:\n\n{relevant_chunks}\n\nQuestion: {query}"
        response_text = call_perplexity_api(prompt)
        return {"choices": [{"message": {"content": response_text}}]}
    def generate_literature_review(self, topic: str) -> Dict:
        """Search arXiv for the topic and synthesize a structured literature review."""
        try:
            # Search arXiv for papers
            papers = self._search_arxiv(topic)
            if not papers:
                return {"error": "No papers found on the topic"}
            # Format paper information
            papers_summary = "\n\n".join(
                [
                    f"Paper: {p['title']}\nAuthors: {', '.join(p['authors'])}\nSummary: {p['summary']}"
                    for p in papers
                ]
            )
            prompt = f"""Generate a comprehensive literature review on '{topic}' based on these papers:

{papers_summary}

Structure the review as follows:
1. Introduction and Background
2. Current Research Trends
3. Key Findings and Themes
4. Research Gaps
5. Future Directions"""
            response_text = call_perplexity_api(prompt)
            return {"choices": [{"message": {"content": response_text}}]}
        except Exception as e:
            return {"error": f"Literature review generation failed: {str(e)}"}
    def ai_writer(self, outline: str, references: List[str]) -> Dict:
        """Draft a paper from an outline, weaving in the supplied references."""
        prompt = f"""Write a research paper following this structure:

Outline:
{outline}

References to incorporate:
{json.dumps(references)}

Instructions:
- Follow academic writing style
- Include appropriate citations
- Maintain logical flow
- Include introduction and conclusion"""
        response_text = call_perplexity_api(prompt)
        return {"choices": [{"message": {"content": response_text}}]}
    def refine_response(self, response: str, column: str) -> str:
        """Tighten a model response so it fits a single CSV column."""
        prompt = f"""Refine the following response to fit the '{column}' column in a research paper CSV format:

Response: {response}

Ensure the response is clear, concise, and fits the context of the column."""
        return call_perplexity_api(prompt)
    def paraphrase(self, text: str) -> Dict:
        """Paraphrase text while keeping an academic tone and the original meaning."""
        prompt = f"""Paraphrase the following text while:
- Maintaining academic tone
- Preserving key meaning
- Improving clarity

Text: {text}"""
        response_text = call_perplexity_api(prompt)
        return {"choices": [{"message": {"content": response_text}}]}
    def generate_citation(self, paper_info: Dict, style: str = "APA") -> Dict:
        """Generate a citation in the requested style (APA, MLA, or Chicago)."""
        prompt = f"""Generate a {style} citation for:
Title: {paper_info['title']}
Authors: {', '.join(paper_info['authors'])}
Year: {paper_info['year']}

Follow exact {style} format guidelines."""
        response_text = call_perplexity_api(prompt)
        return {"citation": response_text}
    def detect_ai_content(self, text: str) -> Dict:
        """Score the text for AI-generated content via the Sapling AI-detection API.

        Detection is delegated entirely to Sapling; no LLM prompt is involved.
        """
        if not SAPLING_API_KEY:
            return {"error": "Sapling API key not found in environment variables."}
        response = requests.post(
            "https://api.sapling.ai/api/v1/aidetect",
            json={"key": SAPLING_API_KEY, "text": text},
            timeout=30,
        )
        st.info(
            "A score from 0 to 1 will be returned, with 0 indicating the maximum "
            "confidence that the text is human-written, and 1 indicating the maximum "
            "confidence that the text is AI-generated."
        )
        if response.status_code == 200:
            return {"choices": [{"message": {"content": response.json()}}]}
        else:
            return {
                "error": f"Sapling API Error: {response.status_code} - {response.text}"
            }
    def _split_text(self, text: str) -> List[str]:
        """Split text into overlapping chunks for retrieval."""
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=200, separators=["\n\n", "\n", ". ", " ", ""]
        )
        return splitter.split_text(text)
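    # The 200-character overlap means consecutive chunks share a margin of
    # text, so a sentence that straddles a chunk boundary still appears intact
    # in at least one chunk.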
    def _get_relevant_chunks(self, chunks: List[str], query: str) -> str:
        """Rank chunks by keyword overlap with the query and return the top three."""
        # Simple keyword-based relevance scoring
        query_words = set(query.lower().split())
        scored_chunks = []
        for chunk in chunks:
            chunk_words = set(chunk.lower().split())
            score = len(query_words.intersection(chunk_words))
            scored_chunks.append((score, chunk))
        # Sort on the score alone so ties don't fall back to comparing chunk text
        scored_chunks.sort(key=lambda pair: pair[0], reverse=True)
        return "\n\n".join(chunk for _, chunk in scored_chunks[:3])
    def _search_arxiv(self, topic: str) -> List[Dict]:
        """Query the arXiv API for up to five papers matching the topic."""
        try:
            query = "+AND+".join(topic.split())
            url = f"http://export.arxiv.org/api/query?search_query=all:{query}&start=0&max_results=5"
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return self._parse_arxiv_response(response.text)
        except Exception as e:
            print(f"arXiv search failed: {str(e)}")
            return []
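    # Example of the URL this builds: for the topic "graph neural networks",
    # the request becomes
    # http://export.arxiv.org/api/query?search_query=all:graph+AND+neural+AND+networks&start=0&max_results=5
    # The words are joined verbatim, so topics containing characters that need
    # URL escaping may produce a malformed query.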
    def _parse_arxiv_response(self, response_text: str) -> List[Dict]:
        """Parse the Atom feed returned by arXiv into a list of paper dicts."""
        try:
            ns = {"atom": "http://www.w3.org/2005/Atom"}
            root = ET.fromstring(response_text)
            papers = []
            for entry in root.findall("atom:entry", ns):
                paper = {
                    "id": entry.find("atom:id", ns).text,
                    "title": entry.find("atom:title", ns).text.strip(),
                    "summary": entry.find("atom:summary", ns).text.strip(),
                    "authors": [
                        author.find("atom:name", ns).text.strip()
                        for author in entry.findall("atom:author", ns)
                    ],
                    "published": entry.find("atom:published", ns).text[:10],
                }
                papers.append(paper)
            return papers
        except Exception as e:
            print(f"arXiv response parsing failed: {str(e)}")
            return []
def main():
    # st.set_page_config(page_title="Research Assistant", layout="wide")
    st.title("Research Copilot")
    if not PERPLEXITY_API_KEY:
        st.warning("Perplexity API key not found in environment variables.")
        return
    assistant = ResearchAssistant(PERPLEXITY_API_KEY)
    tabs = st.tabs(
        [
            "Chat with PDF",
            "Literature Review",
            "AI Writer",
            "Extract Data",
            "Paraphraser",
            "Citation Generator",
            "AI Detector",
        ]
    )
    with tabs[0]:  # Chat with PDF
        st.header("Chat with PDF")
        # File uploader with clear button
        col1, col2 = st.columns([3, 1])
        with col1:
            uploaded_file = st.file_uploader("Upload PDF", type="pdf", key="pdf_chat")
        with col2:
            if st.button("Clear PDF"):
                st.session_state.pop("pdf_text", None)
                st.rerun()
        if uploaded_file:
            if "pdf_text" not in st.session_state:
                with st.spinner("Processing PDF..."):
                    st.session_state.pdf_text = extract_text_from_pdf(uploaded_file)
                st.success("PDF processed successfully!")
            query = st.text_input("Ask a question about the PDF")
            if query:
                with st.spinner("Analyzing..."):
                    response = assistant.chat_with_pdf(st.session_state.pdf_text, query)
                    if "error" in response:
                        st.error(response["error"])
                    else:
                        st.write(response["choices"][0]["message"]["content"])
    with tabs[1]:  # Literature Review
        st.header("Literature Review")
        topic = st.text_input("Enter research topic")
        if st.button("Generate Review") and topic:
            with st.spinner("Generating literature review..."):
                review = assistant.generate_literature_review(topic)
                if "error" in review:
                    st.error(review["error"])
                else:
                    st.write(review["choices"][0]["message"]["content"])
    with tabs[2]:  # AI Writer
        st.header("AI Writer")
        outline = st.text_area("Enter paper outline")
        references = st.text_area("Enter references (one per line)")
        if st.button("Generate Paper") and outline:
            with st.spinner("Writing paper..."):
                # Drop blank lines so empty input doesn't become a [""] reference list
                reference_list = [r.strip() for r in references.split("\n") if r.strip()]
                paper = assistant.ai_writer(outline, reference_list)
                if "error" in paper:
                    st.error(paper["error"])
                else:
                    st.write(paper["choices"][0]["message"]["content"])
    with tabs[3]:  # Extract Data
        st.header("Extract Data")
        uploaded_files = st.file_uploader(
            "Upload multiple PDF files", type="pdf", accept_multiple_files=True
        )
        if uploaded_files:
            if st.button("Process Papers"):
                # Initialize progress bar
                progress_bar = st.progress(0)
                status_text = st.empty()
                # Initialize results list
                results = []
                # Define categories (these match the prompt keys in analyze_paper)
                categories = [
                    "Summarized Abstract",
                    "Results",
                    "Summarized Introduction",
                    "Methods Used",
                    "Literature Survey",
                    "Limitations",
                    "Contributions",
                    "Practical Implications",
                    "Objectives",
                    "Findings",
                    "Future Research",
                    "Dependent Variables",
                    "Independent Variables",
                    "Dataset",
                    "Problem Statement",
                    "Challenges",
                    "Applications",
                ]
                # Process each file
                for i, file in enumerate(uploaded_files):
                    status_text.text(f"Processing {file.name}...")
                    # Extract text from PDF
                    text = extract_text_from_pdf(file)
                    # Initialize paper results
                    paper_results = {"Filename": file.name}
                    # Analyze each category
                    for j, category in enumerate(categories):
                        status_text.text(f"Processing {file.name} - {category}")
                        paper_results[category] = analyze_paper(text, category)
                        # Update progress
                        progress = (i * len(categories) + j + 1) / (
                            len(uploaded_files) * len(categories)
                        )
                        progress_bar.progress(progress)
                        # Add small delay to avoid API rate limits
                        time.sleep(1)
                    results.append(paper_results)
                # Create DataFrame
                df = pd.DataFrame(results)
                # Convert DataFrame to CSV
                csv = df.to_csv(index=False)
                # Create download button
                st.download_button(
                    label="Download Results as CSV",
                    data=csv,
                    file_name="research_papers_analysis.csv",
                    mime="text/csv",
                )
                # Display results in the app
                st.subheader("Analysis Results")
                st.dataframe(df)
                status_text.text("Processing complete!")
                progress_bar.progress(1.0)
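                # Rough cost of a run: each paper triggers one API call per
                # category, so N papers make N x 17 Perplexity requests, plus
                # a one-second pause after each, i.e. at least ~17 seconds of
                # sleep per paper before counting network time.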
    with tabs[4]:  # Paraphraser
        st.header("Paraphraser")
        text = st.text_area("Enter text to paraphrase")
        if st.button("Paraphrase") and text:
            with st.spinner("Paraphrasing..."):
                result = assistant.paraphrase(text)
                if "error" in result:
                    st.error(result["error"])
                else:
                    st.write(result["choices"][0]["message"]["content"])
    with tabs[5]:  # Citation Generator
        st.header("Citation Generator")
        col1, col2 = st.columns(2)
        with col1:
            title = st.text_input("Paper Title")
            authors = st.text_input("Authors (comma-separated)")
        with col2:
            year = st.text_input("Year")
            style = st.selectbox("Citation Style", ["APA", "MLA", "Chicago"])
        if st.button("Generate Citation") and title:
            with st.spinner("Generating citation..."):
                citation = assistant.generate_citation(
                    {
                        "title": title,
                        "authors": [a.strip() for a in authors.split(",")],
                        "year": year,
                    },
                    style,
                )
                if "error" in citation:
                    st.error(citation["error"])
                else:
                    st.code(citation["citation"], language="text")
    with tabs[6]:  # AI Detector
        st.header("AI Detector")
        if not SAPLING_API_KEY:
            st.warning("Sapling API key not found in environment variables.")
        text = st.text_area("Enter text to analyze")
        if st.button("Detect AI Content") and text:
            with st.spinner("Analyzing..."):
                result = assistant.detect_ai_content(text)
                if "error" in result:
                    st.error(result["error"])
                else:
                    st.write(result["choices"][0]["message"]["content"])
if __name__ == "__main__":
    main()