# File size: 4,572 Bytes
# b91146d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import streamlit as st
import pandas as pd
from pymongo import MongoClient
from dotenv import load_dotenv
import os
import json
import re

# 1. Load environment variables (from a local .env file, if one exists)
load_dotenv()
# The connection string is read from MONGODB_URI. The original code looked up
# the misspelled name "MONGODB_UR", so a correctly named variable was ignored;
# the misspelled name is still honoured as a fallback for existing setups.
# NOTE(review): the hard-coded default below embeds credentials in source and
# its user/host part looks mangled ("[email protected]") - confirm the real
# value and move it to the environment instead of relying on this fallback.
MONGODB_URI = os.getenv("MONGODB_URI") or os.getenv(
    "MONGODB_UR",
    "mongodb+srv://milind:[email protected]/?retryWrites=true&w=majority&appName=Cluster0",
)
# 2. Create MongoDB connection and bind the default database/collection
client = MongoClient(MONGODB_URI)
db = client["novascholar_db"]
collection = db["research_papers"]


def convert_mixed_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Flatten list-valued columns into comma-separated strings.

    A column is rewritten only if at least one of its non-null values is a
    list. In such a column every list becomes a ", "-joined string, every
    other non-null value becomes ``str(value)``, and nulls become "".
    Columns without lists are left untouched.

    The DataFrame is modified in place and the same object is returned.
    """

    def _as_text(cell):
        # Lists are joined; anything else is stringified, nulls become "".
        if isinstance(cell, list):
            return ", ".join(str(item) for item in cell)
        if pd.notna(cell):
            return str(cell)
        return ""

    for column_name in df.columns:
        present_values = df[column_name].dropna()
        contains_list = False
        for cell in present_values:
            if isinstance(cell, list):
                contains_list = True
                break
        if not contains_list:
            continue
        df[column_name] = df[column_name].apply(_as_text)

    return df


def filter_and_export_collection_to_csv(keywords_list, doc_collection):
    """Export documents whose 'Keywords' field contains any given keyword.

    Each keyword is matched as a literal, case-insensitive substring of the
    'Keywords' field. Matching documents are loaded into a DataFrame,
    list-valued columns are flattened via ``convert_mixed_columns``, and the
    result is written to ``filtered_papers_export.csv`` in the current
    working directory.

    Args:
        keywords_list: Iterable of keywords. Empty entries are ignored.
        doc_collection: Object exposing ``find(query)`` that returns an
            iterable of documents (a pymongo collection in this app).

    Returns:
        A ``(DataFrame, csv_filename)`` tuple. If no keywords are supplied or
        no document matches, ``(empty DataFrame, None)`` is returned and no
        file is written.
    """
    # Keywords come from user input, so escape them: unescaped metacharacters
    # ("C++", "(", "a.b") would otherwise change the match or produce an
    # invalid pattern. Blank entries are dropped because an empty alternative
    # matches every document.
    escaped_keywords = [
        re.escape(str(keyword)) for keyword in keywords_list if keyword
    ]
    if not escaped_keywords:
        return pd.DataFrame(), None

    # 3. Retrieve filtered documents with a single $regex alternation.
    # $regex is unanchored, so no surrounding ".*" is needed for substring
    # matching.
    query = {
        "Keywords": {"$regex": "|".join(escaped_keywords), "$options": "i"}
    }
    docs = list(doc_collection.find(query))

    # Convert documents to DataFrame
    df = pd.DataFrame(docs)
    if df.empty:
        # Return an empty DataFrame and None if no documents found
        return pd.DataFrame(), None

    # 4. Convert mixed columns so list values serialise consistently
    df = convert_mixed_columns(df)
    # 5. Export to CSV
    csv_filename = "filtered_papers_export.csv"
    df.to_csv(csv_filename, index=False)
    return df, csv_filename


def main():
    """Render the Streamlit page for filtering papers by keyword and exporting them.

    The user picks a paper type (which selects the MongoDB collection) and
    enters one or more comma-separated keywords. Pressing the export button
    runs the filter, then offers the result as a CSV download and shows a
    preview. Any exception raised during export is reported on the page.
    """
    st.title("Filter and Export Papers by Keyword")

    # Paper type picker; the chosen label is mapped to a collection name below.
    paper_type_options = [
        "Review Based Paper",
        "Opinion/Perspective Based Paper",
        "Empirical Research Paper",
        "Research Paper (Other)",
    ]
    paper_type = st.selectbox("Select type of research paper:", paper_type_options)

    # Free-text keyword entry (comma-separated values are split later).
    keyword_input = st.text_input(
        "Enter the exact keyword to filter papers by 'Keywords' field:"
    )

    # Nothing more to do until the user presses the export button.
    if not st.button("Export Filtered Papers to CSV"):
        return

    with st.spinner("Exporting filtered documents..."):
        try:
            # Collection name is the paper type, lower-cased with underscores.
            collection_name = paper_type.replace(" ", "_").lower()
            doc_collection = db[collection_name]

            # Split on commas, trim whitespace, and drop blank entries.
            trimmed_parts = (part.strip() for part in keyword_input.split(","))
            keywords_list = [part for part in trimmed_parts if part]

            if not keywords_list:
                st.warning("Please enter at least one keyword.")
                return

            df, csv_filename = filter_and_export_collection_to_csv(
                keywords_list, doc_collection
            )

            if df.empty or not csv_filename:
                st.warning("No matching documents found for the provided keyword(s).")
                return

            st.success(f"Successfully exported filtered papers to {csv_filename}!")
            csv_bytes = df.to_csv(index=False).encode("utf-8")
            st.download_button(
                label="Download CSV",
                data=csv_bytes,
                file_name=csv_filename,
                mime="text/csv",
            )
            st.write("Preview of the filtered DataFrame:")
            st.dataframe(df)
        except Exception as e:
            # Top-level UI boundary: show the failure instead of crashing the page.
            st.error(f"Error exporting filtered papers: {e!s}")


# Run the Streamlit page only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()