mgbam committed (verified)
Commit 455290d · 1 Parent(s): efc6213

Update pubmed_utils.py

Files changed (1):
  1. pubmed_utils.py +102 -96
pubmed_utils.py CHANGED
@@ -1,96 +1,102 @@
- import requests
- from concurrent.futures import ThreadPoolExecutor, as_completed
- import nltk
- nltk.download('punkt')
- from nltk.tokenize import sent_tokenize
-
- from transformers import pipeline
- from config import MY_PUBMED_EMAIL
-
- # Build a summarization pipeline at module load (caching recommended)
- summarizer = pipeline("summarization", model="facebook/bart-large-cnn", tokenizer="facebook/bart-large-cnn")
-
- def search_pubmed(query, max_results=3):
-     """
-     Searches PubMed via ESearch and returns list of PMIDs.
-     """
-     base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
-     params = {
-         "db": "pubmed",
-         "term": query,
-         "retmax": max_results,
-         "retmode": "json",
-         "tool": "ElysiumRAG",
-         "email": MY_PUBMED_EMAIL
-     }
-     resp = requests.get(base_url, params=params)
-     resp.raise_for_status()
-     data = resp.json()
-     return data.get("esearchresult", {}).get("idlist", [])
-
- def fetch_one_abstract(pmid):
-     """
-     Fetches abstract for a given PMID. Returns (pmid, text).
-     """
-     base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
-     params = {
-         "db": "pubmed",
-         "retmode": "text",
-         "rettype": "abstract",
-         "id": pmid,
-         "tool": "ElysiumRAG",
-         "email": MY_PUBMED_EMAIL
-     }
-     resp = requests.get(base_url, params=params)
-     resp.raise_for_status()
-     raw_text = resp.text.strip() or "No abstract text found."
-     return (pmid, raw_text)
-
- def fetch_pubmed_abstracts(pmids):
-     """
-     Parallel retrieval of multiple PMIDs.
-     """
-     if not pmids:
-         return {}
-
-     results_map = {}
-     with ThreadPoolExecutor(max_workers=min(len(pmids), 5)) as executor:
-         future_to_pmid = {executor.submit(fetch_one_abstract, pmid): pmid for pmid in pmids}
-         for future in as_completed(future_to_pmid):
-             pmid = future_to_pmid[future]
-             try:
-                 pmid_result, text = future.result()
-                 results_map[pmid_result] = text
-             except Exception as e:
-                 results_map[pmid] = f"Error: {str(e)}"
-     return results_map
-
- def chunk_and_summarize(abstract_text, chunk_size=512):
-     """
-     Chunk large abstracts by sentence, summarize each chunk, then combine.
-     """
-     sentences = sent_tokenize(abstract_text)
-     chunks = []
-
-     current_chunk = []
-     current_length = 0
-     for sent in sentences:
-         tokens_in_sent = len(sent.split())
-         if current_length + tokens_in_sent > chunk_size:
-             chunks.append(" ".join(current_chunk))
-             current_chunk = []
-             current_length = 0
-         current_chunk.append(sent)
-         current_length += tokens_in_sent
-
-     if current_chunk:
-         chunks.append(" ".join(current_chunk))
-
-     summarized_pieces = []
-     for c in chunks:
-         summary_out = summarizer(
-             c, max_length=100, min_length=30, do_sample=False
-         )
-         summarized_pieces.append(summary_out[0]['summary_text'])
-
-     return " ".join(summarized_pieces).strip()

+ import requests
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ import nltk
+ nltk.download('punkt')
+ from nltk.tokenize import sent_tokenize
+
+ from transformers import pipeline
+ from config import MY_PUBMED_EMAIL
+
+ # Summarization pipeline for PubMed abstracts
+ summarizer = pipeline(
+     "summarization",
+     model="facebook/bart-large-cnn",
+     tokenizer="facebook/bart-large-cnn"
+ )
+
+ def search_pubmed(query, max_results=3):
+     """
+     Searches PubMed via ESearch. Returns list of PMIDs.
+     """
+     base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
+     params = {
+         "db": "pubmed",
+         "term": query,
+         "retmax": max_results,
+         "retmode": "json",
+         "tool": "ElysiumRAG",
+         "email": MY_PUBMED_EMAIL
+     }
+     resp = requests.get(base_url, params=params)
+     resp.raise_for_status()
+     data = resp.json()
+     return data.get("esearchresult", {}).get("idlist", [])
+
+ def fetch_one_abstract(pmid):
+     """
+     Fetches a single abstract for the given PMID.
+     """
+     base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+     params = {
+         "db": "pubmed",
+         "retmode": "text",
+         "rettype": "abstract",
+         "id": pmid,
+         "tool": "ElysiumRAG",
+         "email": MY_PUBMED_EMAIL
+     }
+     resp = requests.get(base_url, params=params)
+     resp.raise_for_status()
+     raw_text = resp.text.strip() or "No abstract text found."
+     return (pmid, raw_text)
+
+ def fetch_pubmed_abstracts(pmids):
+     """
+     Parallel fetching of multiple abstracts.
+     """
+     if not pmids:
+         return {}
+     results_map = {}
+     with ThreadPoolExecutor(max_workers=min(len(pmids), 5)) as executor:
+         future_to_pmid = {executor.submit(fetch_one_abstract, pmid): pmid for pmid in pmids}
+         for future in as_completed(future_to_pmid):
+             pmid = future_to_pmid[future]
+             try:
+                 pmid_result, text = future.result()
+                 results_map[pmid_result] = text
+             except Exception as e:
+                 results_map[pmid] = f"Error: {str(e)}"
+     return results_map
+
+ def chunk_and_summarize(abstract_text, chunk_size=512):
+     """
+     Splits large abstracts by sentences, summarizes each chunk, then concatenates.
+     """
+     sentences = sent_tokenize(abstract_text)
+     chunks = []
+
+     current_chunk = []
+     current_length = 0
+     for sent in sentences:
+         tokens_in_sent = len(sent.split())
+         if current_length + tokens_in_sent > chunk_size:
+             chunks.append(" ".join(current_chunk))
+             current_chunk = []
+             current_length = 0
+         current_chunk.append(sent)
+         current_length += tokens_in_sent
+
+     if current_chunk:
+         chunks.append(" ".join(current_chunk))
+
+     summarized_pieces = []
+     for c in chunks:
+         summary_out = summarizer(
+             c,
+             max_length=100,
+             min_length=30,
+             do_sample=False
+         )
+         summarized_pieces.append(summary_out[0]['summary_text'])
+
+     return " ".join(summarized_pieces).strip()
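For context, a minimal sketch of how these three helpers might be chained (search → fetch → summarize). It is not part of the commit: the example query, the __main__ guard, and the printing are illustrative assumptions.

# Hypothetical usage sketch for the committed pubmed_utils.py module.
from pubmed_utils import search_pubmed, fetch_pubmed_abstracts, chunk_and_summarize

if __name__ == "__main__":
    # ESearch: query string is an assumption, returns up to 3 PMIDs
    pmids = search_pubmed("glioblastoma immunotherapy", max_results=3)
    # EFetch in parallel: dict of PMID -> abstract text (or error message)
    abstracts = fetch_pubmed_abstracts(pmids)
    for pmid, text in abstracts.items():
        if text.startswith("Error:") or text == "No abstract text found.":
            print(f"PMID {pmid}: {text}")
            continue
        # Sentence-based chunking followed by BART summarization of each chunk
        summary = chunk_and_summarize(text)
        print(f"PMID {pmid}:\n{summary}\n")

Note that the summarizer pipeline is built at module import (the pre-change comment recommends caching), so the first import of pubmed_utils triggers the facebook/bart-large-cnn download.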