mgbam committed · Commit 3e78ff5 · verified · 1 Parent(s): 65e3525

Update pubmed_utils.py

Files changed (1):
  1. pubmed_utils.py +14 -24
pubmed_utils.py CHANGED
@@ -1,22 +1,15 @@
 import requests
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from transformers import pipeline
-import nltk
-nltk.download("punkt")
-from nltk.tokenize import sent_tokenize
-from config import PUBMED_EMAIL
+from config import PUBMED_EMAIL, CHUNK_SIZE
 
-# Summarization model
-summarizer = pipeline(
-    "summarization",
-    model="facebook/bart-large-cnn",
-    tokenizer="facebook/bart-large-cnn"
-)
+# Summarization pipeline
+summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
 
 
 def search_pubmed(query, max_results=5):
     """
-    Searches PubMed for articles matching the query. Returns a list of PMIDs.
+    Search PubMed for PMIDs matching the query.
     """
     url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
     params = {
@@ -29,13 +22,12 @@ def search_pubmed(query, max_results=5):
     }
     response = requests.get(url, params=params)
     response.raise_for_status()
-    data = response.json()
-    return data.get("esearchresult", {}).get("idlist", [])
+    return response.json().get("esearchresult", {}).get("idlist", [])
 
 
 def fetch_abstract(pmid):
     """
-    Fetches an abstract for a given PubMed ID (PMID).
+    Fetch abstract for a given PubMed ID.
     """
     url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
     params = {
@@ -53,7 +45,7 @@ def fetch_abstract(pmid):
 
 def fetch_pubmed_abstracts(pmids):
     """
-    Fetch multiple PubMed abstracts in parallel.
+    Fetch multiple PubMed abstracts concurrently.
     """
     results = {}
     with ThreadPoolExecutor(max_workers=5) as executor:
@@ -67,28 +59,26 @@ def fetch_pubmed_abstracts(pmids):
     return results
 
 
-def summarize_text(text, chunk_size=512):
+def summarize_text(text, chunk_size=CHUNK_SIZE):
     """
-    Summarizes long text by splitting it into chunks and processing each chunk.
+    Summarize long text using a chunking strategy.
     """
-    sentences = sent_tokenize(text)
+    sentences = text.split(". ")
     chunks = []
     current_chunk = []
     current_length = 0
 
     for sentence in sentences:
-        if current_length + len(sentence.split()) > chunk_size:
+        tokens = len(sentence.split())
+        if current_length + tokens > chunk_size:
             chunks.append(" ".join(current_chunk))
             current_chunk = []
             current_length = 0
         current_chunk.append(sentence)
-        current_length += len(sentence.split())
+        current_length += tokens
 
     if current_chunk:
         chunks.append(" ".join(current_chunk))
 
-    summaries = [
-        summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]["summary_text"]
-        for chunk in chunks
-    ]
+    summaries = [summarizer(chunk, max_length=100, min_length=30)[0]["summary_text"] for chunk in chunks]
     return " ".join(summaries)
 