Tuchuanhuhuhu committed on
Commit
8c60761
·
1 Parent(s): d32517d

清理PDF函数中的llama_index

Browse files
Files changed (1) hide show
  1. modules/pdf_func.py +7 -7
modules/pdf_func.py CHANGED
@@ -1,11 +1,11 @@
1
  from types import SimpleNamespace
2
  import pdfplumber
3
  import logging
4
- from llama_index import Document
5
 
6
  def prepare_table_config(crop_page):
7
  """Prepare table查找边界, 要求page为原始page
8
-
9
  From https://github.com/jsvine/pdfplumber/issues/242
10
  """
11
  page = crop_page.root_page # root/parent
@@ -60,7 +60,7 @@ def get_title_with_cropped_page(first_page):
60
  title_bottom = word.bottom
61
  elif word.text == "Abstract": # 获取页面abstract
62
  top = word.top
63
-
64
  user_info = [i["text"] for i in extract_words(first_page.within_bbox((x0,title_bottom,x1,top)))]
65
  # 裁剪掉上半部分, within_bbox: full_included; crop: partial_included
66
  return title, user_info, first_page.within_bbox((x0,top,x1,bottom))
@@ -75,7 +75,7 @@ def get_column_cropped_pages(pages, two_column=True):
75
  new_pages.append(right)
76
  else:
77
  new_pages.append(page)
78
-
79
  return new_pages
80
 
81
  def parse_pdf(filename, two_column = True):
@@ -94,7 +94,7 @@ def parse_pdf(filename, two_column = True):
94
  name_top=name_top,
95
  name_bottom=name_bottom,
96
  record_chapter_name = True,
97
-
98
  page_start=page_start,
99
  page_stop=None,
100
 
@@ -114,7 +114,7 @@ def parse_pdf(filename, two_column = True):
114
  if word.size >= 11: # 出现chapter name
115
  if cur_chapter is None:
116
  cur_chapter = create_chapter(page.page_number, word.top, word.bottom)
117
- elif not cur_chapter.record_chapter_name or (cur_chapter.name_bottom != cur_chapter.name_bottom and cur_chapter.name_top != cur_chapter.name_top):
118
  # 不再继续写chapter name
119
  cur_chapter.page_stop = page.page_number # stop id
120
  chapters.append(cur_chapter)
@@ -143,7 +143,7 @@ def parse_pdf(filename, two_column = True):
143
  text += f"The {idx}th Chapter {chapter.name}: " + " ".join(chapter.text) + "\n"
144
 
145
  logging.getLogger().setLevel(level)
146
- return Document(text=text, extra_info={"title": title})
147
 
148
  BASE_POINTS = """
149
  1. Who are the authors?
 
1
  from types import SimpleNamespace
2
  import pdfplumber
3
  import logging
4
+ from langchain.docstore.document import Document
5
 
6
  def prepare_table_config(crop_page):
7
  """Prepare table查找边界, 要求page为原始page
8
+
9
  From https://github.com/jsvine/pdfplumber/issues/242
10
  """
11
  page = crop_page.root_page # root/parent
 
60
  title_bottom = word.bottom
61
  elif word.text == "Abstract": # 获取页面abstract
62
  top = word.top
63
+
64
  user_info = [i["text"] for i in extract_words(first_page.within_bbox((x0,title_bottom,x1,top)))]
65
  # 裁剪掉上半部分, within_bbox: full_included; crop: partial_included
66
  return title, user_info, first_page.within_bbox((x0,top,x1,bottom))
 
75
  new_pages.append(right)
76
  else:
77
  new_pages.append(page)
78
+
79
  return new_pages
80
 
81
  def parse_pdf(filename, two_column = True):
 
94
  name_top=name_top,
95
  name_bottom=name_bottom,
96
  record_chapter_name = True,
97
+
98
  page_start=page_start,
99
  page_stop=None,
100
 
 
114
  if word.size >= 11: # 出现chapter name
115
  if cur_chapter is None:
116
  cur_chapter = create_chapter(page.page_number, word.top, word.bottom)
117
+ elif not cur_chapter.record_chapter_name or (cur_chapter.name_bottom != cur_chapter.name_bottom and cur_chapter.name_top != cur_chapter.name_top):
118
  # 不再继续写chapter name
119
  cur_chapter.page_stop = page.page_number # stop id
120
  chapters.append(cur_chapter)
 
143
  text += f"The {idx}th Chapter {chapter.name}: " + " ".join(chapter.text) + "\n"
144
 
145
  logging.getLogger().setLevel(level)
146
+ return Document(page_content=text, metadata={"title": title})
147
 
148
  BASE_POINTS = """
149
  1. Who are the authors?