NovaScholar / sciclone.py
omkar-surve126's picture
Upload 38 files
b91146d verified
import streamlit as st
import requests
import PyPDF2
from typing import Optional, Dict, List
import json
from langchain.text_splitter import RecursiveCharacterTextSplitter
from concurrent.futures import ThreadPoolExecutor
import xml.etree.ElementTree as ET
import re
from datetime import datetime
import time
from dotenv import load_dotenv
import os
import pandas as pd
# Pull settings from a .env file into the process environment (python-dotenv)
# before any of the keys below are read.
load_dotenv()

# Perplexity chat-completions credentials and endpoint.
PERPLEXITY_API_KEY = os.environ.get("PERPLEXITY_API_KEY")
PERPLEXITY_API_URL = "https://api.perplexity.ai/chat/completions"

# Key sent with requests to the Sapling AI-detection endpoint.
SAPLING_API_KEY = os.environ.get("SAPLING_API_KEY")
def call_perplexity_api(prompt: str) -> str:
    """Send a single-turn prompt to Perplexity AI and return the reply text.

    Args:
        prompt: Text sent as the only ``user`` message of the conversation.

    Returns:
        The content of the first choice in the response. If the request fails
        or the response does not have the expected shape, the error is shown
        in the UI via ``st.error`` and an empty string is returned; nothing is
        raised to the caller.
    """
    headers = {
        "Authorization": f"Bearer {PERPLEXITY_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": "llama-3.1-sonar-small-128k-chat",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.3,
    }
    try:
        # requests never times out by default; bound the wait so a stalled
        # connection cannot hang the Streamlit script run forever.
        response = requests.post(
            PERPLEXITY_API_URL, headers=headers, json=payload, timeout=60
        )
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"]
    except Exception as e:
        # UI boundary: every failure (network, HTTP status, malformed JSON)
        # degrades to an on-screen error plus an empty reply.
        st.error(f"API Error: {str(e)}")
        return ""
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF.

    Args:
        pdf_file: Whatever ``PyPDF2.PdfReader`` accepts as its source (the
            callers in this file pass Streamlit uploaded-file objects).

    Returns:
        The text of each page followed by a newline, concatenated in page
        order. An empty document yields an empty string.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # ``or ""`` is a defensive guard: if extract_text() ever yields None for a
    # page (NOTE(review): confirm against the installed PyPDF2 version), that
    # page contributes an empty line instead of raising TypeError.
    return "".join(f"{page.extract_text() or ''}\n" for page in pdf_reader.pages)
# Instruction prefix sent to the model for each supported analysis category.
_CATEGORY_PROMPTS = {
    "Summarized Abstract": "Extract and summarize the abstract from this research paper:",
    "Results": "What are the main results and findings from this research paper:",
    "Summarized Introduction": "Summarize the introduction section of this research paper:",
    "Methods Used": "What are the main methods and methodologies used in this research:",
    "Literature Survey": "Summarize the literature review or related work from this paper:",
    "Limitations": "What are the limitations mentioned in this research:",
    "Contributions": "What are the main contributions of this research:",
    "Practical Implications": "What are the practical implications of this research:",
    "Objectives": "What are the main objectives of this research:",
    "Findings": "What are the key findings from this research:",
    "Future Research": "What future research directions are suggested in this paper:",
    "Dependent Variables": "What are the dependent variables studied in this research:",
    "Independent Variables": "What are the independent variables studied in this research:",
    "Dataset": "What dataset(s) were used in this research:",
    "Problem Statement": "What is the main problem statement or research question:",
    "Challenges": "What challenges were faced or addressed in this research:",
    "Applications": "What are the potential applications of this research:",
}


def analyze_paper(text: str, category: str, max_chars: Optional[int] = 5000) -> str:
    """Ask the model to analyze one aspect of a paper.

    Args:
        text: Full text of the paper.
        category: One of the keys of ``_CATEGORY_PROMPTS``.
        max_chars: Number of leading characters of ``text`` included in the
            prompt. Defaults to 5000; pass ``None`` to send the whole text.

    Returns:
        Whatever ``call_perplexity_api`` returns for the generated prompt.

    Raises:
        KeyError: If ``category`` is not a supported category.
    """
    # Limit text to avoid token limits
    excerpt = text[:max_chars]
    prompt = f"{_CATEGORY_PROMPTS[category]}\n\nPaper text: {excerpt}"
    return call_perplexity_api(prompt)
class ResearchAssistant:
    """Research helpers built on Perplexity, the arXiv API and Sapling.

    Methods that feed the UI return either a chat-completion-shaped dict,
    ``{"choices": [{"message": {"content": ...}}]}``, or ``{"error": "..."}``
    (``generate_citation`` returns ``{"citation": ...}`` instead).
    """

    # Atom XML namespace prefix used when reading the arXiv feed.
    _ATOM = "{http://www.w3.org/2005/Atom}"

    def __init__(self, perplexity_key: str):
        """Store the Perplexity key.

        The methods below call the module-level ``call_perplexity_api``, which
        reads the module-level key; the stored value is kept for callers only.
        """
        self.perplexity_key = perplexity_key

    def chat_with_pdf(self, pdf_text: str, query: str) -> Dict:
        """Answer ``query`` using the chunks of ``pdf_text`` most relevant to it."""
        chunks = self._split_text(pdf_text)
        relevant_chunks = self._get_relevant_chunks(chunks, query)
        prompt = f"Context from PDF:\n\n{relevant_chunks}\n\nQuestion: {query}"
        response_text = call_perplexity_api(prompt)
        return {"choices": [{"message": {"content": response_text}}]}

    def generate_literature_review(self, topic: str) -> Dict:
        """Search arXiv for ``topic`` and ask the model for a structured review.

        Returns an ``{"error": ...}`` dict when no papers are found or when
        anything unexpected goes wrong while building the review.
        """
        try:
            # Search arXiv for papers
            papers = self._search_arxiv(topic)
            if not papers:
                return {"error": "No papers found on the topic"}
            # Format paper information
            papers_summary = "\n\n".join(
                f"Paper: {p['title']}\nAuthors: {', '.join(p['authors'])}\nSummary: {p['summary']}"
                for p in papers
            )
            prompt = (
                f"Generate a comprehensive literature review on '{topic}'. Based on these papers:\n"
                f"{papers_summary}\n"
                "Structure the review as follows:\n"
                "1. Introduction and Background\n"
                "2. Current Research Trends\n"
                "3. Key Findings and Themes\n"
                "4. Research Gaps\n"
                "5. Future Directions"
            )
            response_text = call_perplexity_api(prompt)
            return {"choices": [{"message": {"content": response_text}}]}
        except Exception as e:
            return {"error": f"Literature review generation failed: {str(e)}"}

    def ai_writer(self, outline: str, references: List[str]) -> Dict:
        """Ask the model to write a paper from ``outline`` citing ``references``."""
        prompt = (
            "Write a research paper following this structure:\n"
            "Outline:\n"
            f"{outline}\n"
            "References to incorporate:\n"
            f"{json.dumps(references)}\n"
            "Instructions:\n"
            "- Follow academic writing style\n"
            "- Include appropriate citations\n"
            "- Maintain logical flow\n"
            "- Include introduction and conclusion"
        )
        response_text = call_perplexity_api(prompt)
        return {"choices": [{"message": {"content": response_text}}]}

    def refine_response(self, response: str, column: str) -> str:
        """Ask the model to tighten ``response`` for the CSV column ``column``."""
        prompt = (
            f"Refine the following response to fit the '{column}' column in a research paper CSV format:\n"
            f"Response: {response}\n"
            "Ensure the response is clear, concise, and fits the context of the column."
        )
        return call_perplexity_api(prompt)

    def paraphrase(self, text: str) -> Dict:
        """Ask the model for an academic-tone paraphrase of ``text``."""
        prompt = (
            "Paraphrase the following text while:\n"
            "- Maintaining academic tone\n"
            "- Preserving key meaning\n"
            "- Improving clarity\n"
            f"Text: {text}"
        )
        response_text = call_perplexity_api(prompt)
        return {"choices": [{"message": {"content": response_text}}]}

    def generate_citation(self, paper_info: Dict, style: str = "APA") -> Dict:
        """Ask the model for a citation in ``style``.

        Args:
            paper_info: Dict with ``"title"``, ``"authors"`` (list of str) and
                ``"year"`` keys.
            style: Citation style name inserted into the prompt.

        Returns:
            ``{"citation": <model reply>}``.
        """
        prompt = (
            f"Generate a {style} citation for:\n"
            f"Title: {paper_info['title']}\n"
            f"Authors: {', '.join(paper_info['authors'])}\n"
            f"Year: {paper_info['year']}\n"
            f"Follow exact {style} format guidelines."
        )
        response_text = call_perplexity_api(prompt)
        return {"citation": response_text}

    def detect_ai_content(self, text: str) -> Dict:
        """Score ``text`` with the Sapling AI-detection endpoint.

        Returns:
            On success, a chat-completion-shaped dict whose content is the
            decoded JSON body returned by Sapling. On a network failure, a
            non-200 status or an undecodable body, ``{"error": "..."}``.
        """
        try:
            # Bound the wait: requests has no default timeout.
            response = requests.post(
                "https://api.sapling.ai/api/v1/aidetect",
                json={"key": SAPLING_API_KEY, "text": text},
                timeout=30,
            )
        except requests.RequestException as e:
            return {"error": f"Sapling API Error: {str(e)}"}
        if response.status_code != 200:
            return {
                "error": f"Sapling API Error: {response.status_code} - {response.text}"
            }
        try:
            scores = response.json()
        except ValueError:
            return {"error": "Sapling API Error: response was not valid JSON"}
        # Explain the score only when there is a score to show.
        st.info(
            "A score from 0 to 1 will be returned, with 0 indicating the maximum confidence that the text is human-written, and 1 indicating the maximum confidence that the text is AI-generated."
        )
        return {"choices": [{"message": {"content": scores}}]}

    def _split_text(self, text: str) -> List[str]:
        """Split ``text`` into overlapping chunks (size 1000, overlap 200)."""
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=200, separators=["\n\n", "\n", ". ", " ", ""]
        )
        return splitter.split_text(text)

    @staticmethod
    def _tokenize(text: str) -> set:
        """Return the set of lowercase word tokens in ``text``, ignoring punctuation."""
        return set(re.findall(r"\w+", text.lower()))

    def _get_relevant_chunks(self, chunks: List[str], query: str) -> str:
        """Return the three chunks sharing the most words with ``query``.

        Scoring is a simple keyword overlap on punctuation-free, lowercase
        tokens, so ``"dataset?"`` in the query still matches ``"dataset"`` in
        a chunk. Chunks are joined with blank lines, best score first; chunks
        with equal scores keep their original document order.
        """
        query_words = self._tokenize(query)
        scored_chunks = [
            (len(query_words & self._tokenize(chunk)), chunk) for chunk in chunks
        ]
        # Sort on the score alone: list.sort is stable (also with
        # reverse=True), so ties are not reordered by chunk text.
        scored_chunks.sort(key=lambda pair: pair[0], reverse=True)
        return "\n\n".join(chunk for _, chunk in scored_chunks[:3])

    def _search_arxiv(self, topic: str) -> List[Dict]:
        """Query the arXiv API for up to five papers matching every word of ``topic``.

        Returns an empty list for a blank topic or when the request fails
        (the failure is printed, not raised).
        """
        from urllib.parse import quote

        # Percent-encode each word so characters such as '&' or '#' in the
        # topic cannot truncate or corrupt the query string.
        terms = [quote(word, safe="") for word in topic.split()]
        if not terms:
            return []
        query = "+AND+".join(terms)
        url = f"http://export.arxiv.org/api/query?search_query=all:{query}&start=0&max_results=5"
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"arXiv search failed: {str(e)}")
            return []
        return self._parse_arxiv_response(response.text)

    def _child_text(self, node, tag: str) -> str:
        """Return the text of the Atom child ``tag`` of ``node``, or ``""`` if absent/empty."""
        child = node.find(f"{self._ATOM}{tag}")
        if child is None or not child.text:
            return ""
        return child.text

    def _parse_arxiv_response(self, response_text: str) -> List[Dict]:
        """Convert an arXiv Atom feed into a list of paper dicts.

        Each dict has ``id``, ``title``, ``summary``, ``authors`` (list of
        names) and ``published`` (first 10 characters of the timestamp).
        Missing optional fields become empty values instead of discarding the
        whole feed; entries without a title are skipped. Malformed XML yields
        an empty list (the parse error is printed, not raised).
        """
        try:
            root = ET.fromstring(response_text)
        except ET.ParseError as e:
            print(f"arXiv response parsing failed: {str(e)}")
            return []
        papers = []
        for entry in root.findall(f"{self._ATOM}entry"):
            title = self._child_text(entry, "title").strip()
            if not title:
                continue
            author_names = (
                self._child_text(author, "name").strip()
                for author in entry.findall(f"{self._ATOM}author")
            )
            papers.append(
                {
                    "id": self._child_text(entry, "id"),
                    "title": title,
                    "summary": self._child_text(entry, "summary").strip(),
                    "authors": [name for name in author_names if name],
                    "published": self._child_text(entry, "published")[:10],
                }
            )
        return papers
# Analysis categories extracted for every uploaded paper, in CSV column order.
_EXTRACT_CATEGORIES = (
    "Summarized Abstract",
    "Results",
    "Summarized Introduction",
    "Methods Used",
    "Literature Survey",
    "Limitations",
    "Contributions",
    "Practical Implications",
    "Objectives",
    "Findings",
    "Future Research",
    "Dependent Variables",
    "Independent Variables",
    "Dataset",
    "Problem Statement",
    "Challenges",
    "Applications",
)


def _show_response(result: Dict) -> None:
    """Render a result dict: its error message, or the first choice's content."""
    if "error" in result:
        st.error(result["error"])
    else:
        st.write(result["choices"][0]["message"]["content"])


def _render_chat_tab(assistant: "ResearchAssistant") -> None:
    """Draw the "Chat with PDF" tab: upload a PDF, then ask questions about it."""
    st.header("Chat with PDF")
    # File uploader with clear button
    uploader_col, clear_col = st.columns([3, 1])
    with uploader_col:
        uploaded_file = st.file_uploader("Upload PDF", type="pdf", key="pdf_chat")
    with clear_col:
        if st.button("Clear PDF"):
            st.session_state.pop("pdf_text", None)
            st.session_state.pop("pdf_source", None)
            st.rerun()
    if not uploaded_file:
        return
    # Remember which file the cached text came from, so uploading a different
    # PDF re-extracts instead of silently answering from the previous one.
    source = (uploaded_file.name, uploaded_file.size)
    if (
        "pdf_text" not in st.session_state
        or st.session_state.get("pdf_source") != source
    ):
        with st.spinner("Processing PDF..."):
            st.session_state.pdf_text = extract_text_from_pdf(uploaded_file)
            st.session_state.pdf_source = source
        st.success("PDF processed successfully!")
    query = st.text_input("Ask a question about the PDF")
    if query:
        with st.spinner("Analyzing..."):
            response = assistant.chat_with_pdf(st.session_state.pdf_text, query)
        _show_response(response)


def _render_literature_review_tab(assistant: "ResearchAssistant") -> None:
    """Draw the "Literature Review" tab."""
    st.header("Literature Review")
    topic = st.text_input("Enter research topic")
    if st.button("Generate Review") and topic:
        with st.spinner("Generating literature review..."):
            review = assistant.generate_literature_review(topic)
        _show_response(review)


def _render_ai_writer_tab(assistant: "ResearchAssistant") -> None:
    """Draw the "AI Writer" tab."""
    st.header("AI Writer")
    outline = st.text_area("Enter paper outline")
    references = st.text_area("Enter references (one per line)")
    if st.button("Generate Paper") and outline:
        # Drop blank lines so empty strings are not sent as references.
        reference_list = [
            line.strip() for line in references.splitlines() if line.strip()
        ]
        with st.spinner("Writing paper..."):
            paper = assistant.ai_writer(outline, reference_list)
        _show_response(paper)


def _analyze_uploaded_papers(uploaded_files) -> List[Dict]:
    """Run every category analysis on every uploaded PDF, showing progress.

    Returns one dict per readable file: ``"Filename"`` plus one key per
    category. Files whose text cannot be extracted are reported with a
    warning and skipped, so one bad PDF does not abort the batch.
    """
    progress_bar = st.progress(0)
    status_text = st.empty()
    total_steps = len(uploaded_files) * len(_EXTRACT_CATEGORIES)
    results = []
    for i, file in enumerate(uploaded_files):
        status_text.text(f"Processing {file.name}...")
        try:
            text = extract_text_from_pdf(file)
        except Exception as e:
            st.warning(f"Skipping {file.name}: could not read PDF ({e})")
            continue
        paper_results = {"Filename": file.name}
        for j, category in enumerate(_EXTRACT_CATEGORIES):
            status_text.text(f"Processing {file.name} - {category}")
            paper_results[category] = analyze_paper(text, category)
            steps_done = i * len(_EXTRACT_CATEGORIES) + j + 1
            progress_bar.progress(steps_done / total_steps)
            # Add small delay to avoid API rate limits
            time.sleep(1)
        results.append(paper_results)
    status_text.text("Processing complete!")
    progress_bar.progress(1.0)
    return results


def _render_extract_data_tab() -> None:
    """Draw the "Extract Data" tab: batch-analyze PDFs and offer a CSV download."""
    st.header("Extract Data")
    uploaded_files = st.file_uploader(
        "Upload multiple PDF files", type="pdf", accept_multiple_files=True
    )
    if not uploaded_files:
        return
    batch_id = tuple((f.name, f.size) for f in uploaded_files)
    if st.button("Process Papers"):
        # Keep the rows in session state: clicking the download button reruns
        # the script, and the "Process Papers" button is False on that rerun.
        st.session_state["extract_results"] = {
            "batch_id": batch_id,
            "rows": _analyze_uploaded_papers(uploaded_files),
        }
    stored = st.session_state.get("extract_results")
    # Only show results that belong to the files currently uploaded.
    if not stored or stored["batch_id"] != batch_id or not stored["rows"]:
        return
    df = pd.DataFrame(stored["rows"])
    csv = df.to_csv(index=False)
    st.download_button(
        label="Download Results as CSV",
        data=csv,
        file_name="research_papers_analysis.csv",
        mime="text/csv",
    )
    st.subheader("Analysis Results")
    st.dataframe(df)


def _render_paraphraser_tab(assistant: "ResearchAssistant") -> None:
    """Draw the "Paraphraser" tab."""
    st.header("Paraphraser")
    text = st.text_area("Enter text to paraphrase")
    if st.button("Paraphrase") and text:
        with st.spinner("Paraphrasing..."):
            result = assistant.paraphrase(text)
        _show_response(result)


def _render_citation_tab(assistant: "ResearchAssistant") -> None:
    """Draw the "Citation Generator" tab."""
    st.header("Citation Generator")
    left_col, right_col = st.columns(2)
    with left_col:
        title = st.text_input("Paper Title")
        authors = st.text_input("Authors (comma-separated)")
    with right_col:
        year = st.text_input("Year")
        style = st.selectbox("Citation Style", ["APA", "MLA", "Chicago"])
    if st.button("Generate Citation") and title:
        paper_info = {
            "title": title,
            # Ignore empty pieces such as a trailing comma.
            "authors": [a.strip() for a in authors.split(",") if a.strip()],
            "year": year,
        }
        with st.spinner("Generating citation..."):
            citation = assistant.generate_citation(paper_info, style)
        if "error" in citation:
            st.error(citation["error"])
        else:
            st.code(citation["citation"], language="text")


def _render_ai_detector_tab(assistant: "ResearchAssistant") -> None:
    """Draw the "AI Detector" tab."""
    st.header("AI Detector")
    text = st.text_area("Enter text to analyze")
    if st.button("Detect AI Content") and text:
        with st.spinner("Analyzing..."):
            result = assistant.detect_ai_content(text)
        _show_response(result)


def main():
    """Build the "Research Copilot" Streamlit page.

    Shows a warning and stops if no Perplexity API key is configured;
    otherwise creates one ``ResearchAssistant`` and renders the seven tool
    tabs.
    """
    # st.set_page_config(page_title="Research Assistant", layout="wide")
    st.title("Research Copilot")
    if not PERPLEXITY_API_KEY:
        st.warning("Perplexity API key not found in environment variables.")
        return
    assistant = ResearchAssistant(PERPLEXITY_API_KEY)
    (
        chat_tab,
        review_tab,
        writer_tab,
        extract_tab,
        paraphrase_tab,
        citation_tab,
        detector_tab,
    ) = st.tabs(
        [
            "Chat with PDF",
            "Literature Review",
            "AI Writer",
            "Extract Data",
            "Paraphraser",
            "Citation Generator",
            "AI Detector",
        ]
    )
    with chat_tab:
        _render_chat_tab(assistant)
    with review_tab:
        _render_literature_review_tab(assistant)
    with writer_tab:
        _render_ai_writer_tab(assistant)
    with extract_tab:
        _render_extract_data_tab()
    with paraphrase_tab:
        _render_paraphraser_tab(assistant)
    with citation_tab:
        _render_citation_tab(assistant)
    with detector_tab:
        _render_ai_detector_tab(assistant)


if __name__ == "__main__":
    main()