mgbam committed (verified)
Commit f3774c1 · Parent: b9d5274

Update pubmed_utils.py

Files changed (1):
  1. pubmed_utils.py +45 -53
pubmed_utils.py CHANGED
@@ -1,102 +1,94 @@
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
+ from transformers import pipeline
import nltk
- nltk.download('punkt')
+ nltk.download("punkt")
from nltk.tokenize import sent_tokenize
+ from config import PUBMED_EMAIL

- from transformers import pipeline
- from config import MY_PUBMED_EMAIL
-
- # Summarization pipeline for PubMed abstracts
+ # Summarization model
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    tokenizer="facebook/bart-large-cnn"
)

- def search_pubmed(query, max_results=3):
+
+ def search_pubmed(query, max_results=5):
    """
-     Searches PubMed via ESearch. Returns list of PMIDs.
+     Searches PubMed for articles matching the query. Returns a list of PMIDs.
    """
-     base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
+     url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": query,
        "retmax": max_results,
        "retmode": "json",
-         "tool": "ElysiumRAG",
-         "email": MY_PUBMED_EMAIL
+         "tool": "MedicalAI",
+         "email": PUBMED_EMAIL,
    }
-     resp = requests.get(base_url, params=params)
-     resp.raise_for_status()
-     data = resp.json()
+     response = requests.get(url, params=params)
+     response.raise_for_status()
+     data = response.json()
    return data.get("esearchresult", {}).get("idlist", [])

- def fetch_one_abstract(pmid):
+
+ def fetch_abstract(pmid):
    """
-     Fetches a single abstract for the given PMID.
+     Fetches an abstract for a given PubMed ID (PMID).
    """
-     base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+     url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "pubmed",
+         "id": pmid,
        "retmode": "text",
        "rettype": "abstract",
-         "id": pmid,
-         "tool": "ElysiumRAG",
-         "email": MY_PUBMED_EMAIL
+         "tool": "MedicalAI",
+         "email": PUBMED_EMAIL,
    }
-     resp = requests.get(base_url, params=params)
-     resp.raise_for_status()
-     raw_text = resp.text.strip() or "No abstract text found."
-     return (pmid, raw_text)
+     response = requests.get(url, params=params)
+     response.raise_for_status()
+     return response.text.strip()
+

def fetch_pubmed_abstracts(pmids):
    """
-     Parallel fetching of multiple abstracts.
+     Fetch multiple PubMed abstracts in parallel.
    """
-     if not pmids:
-         return {}
-     results_map = {}
-     with ThreadPoolExecutor(max_workers=min(len(pmids), 5)) as executor:
-         future_to_pmid = {executor.submit(fetch_one_abstract, pmid): pmid for pmid in pmids}
+     results = {}
+     with ThreadPoolExecutor(max_workers=5) as executor:
+         future_to_pmid = {executor.submit(fetch_abstract, pmid): pmid for pmid in pmids}
        for future in as_completed(future_to_pmid):
            pmid = future_to_pmid[future]
            try:
-                 pmid_result, text = future.result()
-                 results_map[pmid_result] = text
+                 results[pmid] = future.result()
            except Exception as e:
-                 results_map[pmid] = f"Error: {str(e)}"
-     return results_map
+                 results[pmid] = f"Error fetching PMID {pmid}: {str(e)}"
+     return results
+

- def chunk_and_summarize(abstract_text, chunk_size=512):
+ def summarize_text(text, chunk_size=512):
    """
-     Splits large abstracts by sentences, summarizes each chunk, then concatenates.
+     Summarizes long text by splitting it into chunks and processing each chunk.
    """
-     sentences = sent_tokenize(abstract_text)
+     sentences = sent_tokenize(text)
    chunks = []
-
    current_chunk = []
    current_length = 0
-     for sent in sentences:
-         tokens_in_sent = len(sent.split())
-         if current_length + tokens_in_sent > chunk_size:
+
+     for sentence in sentences:
+         if current_length + len(sentence.split()) > chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
-         current_chunk.append(sent)
-         current_length += tokens_in_sent
+         current_chunk.append(sentence)
+         current_length += len(sentence.split())

    if current_chunk:
        chunks.append(" ".join(current_chunk))

-     summarized_pieces = []
-     for c in chunks:
-         summary_out = summarizer(
-             c,
-             max_length=100,
-             min_length=30,
-             do_sample=False
-         )
-         summarized_pieces.append(summary_out[0]['summary_text'])
-
-     return " ".join(summarized_pieces).strip()
+     summaries = [
+         summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]["summary_text"]
+         for chunk in chunks
+     ]
+     return " ".join(summaries)