tahirsher committed
Commit e4df02d · verified · 1 Parent(s): b6b28b4

Create app.py

Files changed (1)
  1. app.py +88 -0
app.py ADDED
import pandas as pd
from rank_bm25 import BM25Okapi
from transformers import pipeline
import streamlit as st
from datasets import load_dataset

# Load a dataset from the Hugging Face Hub with error handling.
# Returns a pandas DataFrame, or None if loading fails.
def load_huggingface_dataset(dataset_name, config=None, split="train"):
    try:
        if config:
            dataset = load_dataset(dataset_name, config, split=split)
        else:
            dataset = load_dataset(dataset_name, split=split)
        return pd.DataFrame(dataset)  # Convert to a pandas DataFrame
    except Exception as e:
        st.error(
            f"Failed to load dataset '{dataset_name}' with config '{config}'. "
            "Please try 'lex_glue' or 'eurlex' with an appropriate config."
        )
        st.error(f"Error details: {e}")
        return None

# Prepare the retrieval model (BM25) over a whitespace-tokenized corpus
def prepare_bm25(corpus):
    # str() guards against configs whose text field is not a plain string
    tokenized_corpus = [str(doc).split(" ") for doc in corpus]
    return BM25Okapi(tokenized_corpus)

# Retrieve the top_n most relevant documents for a query.
# Note: BM25Okapi.get_top_n returns the documents themselves, not their scores.
def search_documents(bm25, query, corpus, top_n=5):
    tokenized_query = query.split(" ")
    return bm25.get_top_n(tokenized_query, corpus, n=top_n)

# Summarize a case text with a publicly available model
def summarize_text(text):
    try:
        summarizer = pipeline("summarization", model="t5-base")  # public model
        # truncation=True keeps long case texts within the model's input limit
        summary = summarizer(
            text, max_length=130, min_length=30, do_sample=False, truncation=True
        )
        return summary[0]["summary_text"]
    except Exception as e:
        st.error(f"Error in summarization: {e}")
        return "Summary could not be generated."

# Streamlit app
def main():
    st.title("Legal Case Summarizer")

    # Dataset selection
    dataset_name = st.selectbox("Choose Hugging Face dataset", ["lex_glue", "eurlex"])
    config = None

    # Config selection for lex_glue
    if dataset_name == "lex_glue":
        config = st.selectbox(
            "Select config for lex_glue",
            ["case_hold", "ecthr_a", "ecthr_b", "eurlex", "ledgar", "scotus", "unfair_tos"],
        )

    split = st.selectbox("Choose dataset split", ["train", "validation", "test"])

    if dataset_name:
        st.write("Loading dataset from Hugging Face...")
        data = load_huggingface_dataset(dataset_name, config=config, split=split)

        if data is not None:
            # Fall back to the first column when there is no 'text' column
            corpus = data["text"].tolist() if "text" in data.columns else data.iloc[:, 0].tolist()
            titles = (
                data["title"].tolist()
                if "title" in data.columns
                else ["Title " + str(i) for i in range(len(corpus))]
            )

            # Prepare the BM25 model
            bm25 = prepare_bm25(corpus)

            # User input
            query = st.text_input("Enter keywords for case search:")
            num_results = st.slider("Number of results to display", 1, 10, 5)

            if query:
                st.write("Searching for relevant cases...")
                results = search_documents(bm25, query, corpus, top_n=num_results)

                for idx, result in enumerate(results):
                    st.write(f"### Case {idx + 1}: {titles[corpus.index(result)]}")
                    st.write(result)

                    # Summarize the case
                    st.write("Summary:")
                    summary = summarize_text(result)
                    st.write(summary)

if __name__ == "__main__":
    main()
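
For reference, a minimal sanity check of the retrieval step that runs outside Streamlit. rank_bm25's get_top_n returns the best-matching documents themselves (which is why the app indexes back into corpus to recover titles), while get_scores exposes the raw per-document scores; the toy corpus below is invented purely for illustration.

from rank_bm25 import BM25Okapi

# Toy corpus, invented for illustration only
corpus = [
    "the court dismissed the appeal",
    "the contract was declared void",
    "the appeal was upheld on procedural grounds",
]
bm25 = BM25Okapi([doc.split(" ") for doc in corpus])

query = "appeal court".split(" ")
print(bm25.get_scores(query))              # one BM25 score per document
print(bm25.get_top_n(query, corpus, n=2))  # the two best-matching documents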
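
One further note: summarize_text builds a fresh t5-base pipeline on every call, so the model is reloaded for each displayed case. A minimal sketch of caching it with Streamlit's st.cache_resource follows; the helper name get_summarizer is my own, and this assumes a Streamlit version (1.18+) that provides st.cache_resource.

import streamlit as st
from transformers import pipeline

@st.cache_resource  # load the model once per server process and reuse it across reruns
def get_summarizer():
    # Hypothetical helper: summarize_text would call get_summarizer()
    # instead of constructing the pipeline inline.
    return pipeline("summarization", model="t5-base")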