Spaces:
Sleeping
Sleeping
Tuchuanhuhuhu
committed on
Commit
·
8c60761
1
Parent(s):
d32517d
清理PDF函数中的llama_index
Browse files
- modules/pdf_func.py +7 -7
modules/pdf_func.py
CHANGED
@@ -1,11 +1,11 @@
|
|
1 |
from types import SimpleNamespace
|
2 |
import pdfplumber
|
3 |
import logging
|
4 |
-
from
|
5 |
|
6 |
def prepare_table_config(crop_page):
|
7 |
"""Prepare table查找边界, 要求page为原始page
|
8 |
-
|
9 |
From https://github.com/jsvine/pdfplumber/issues/242
|
10 |
"""
|
11 |
page = crop_page.root_page # root/parent
|
@@ -60,7 +60,7 @@ def get_title_with_cropped_page(first_page):
|
|
60 |
title_bottom = word.bottom
|
61 |
elif word.text == "Abstract": # 获取页面abstract
|
62 |
top = word.top
|
63 |
-
|
64 |
user_info = [i["text"] for i in extract_words(first_page.within_bbox((x0,title_bottom,x1,top)))]
|
65 |
# 裁剪掉上半部分, within_bbox: full_included; crop: partial_included
|
66 |
return title, user_info, first_page.within_bbox((x0,top,x1,bottom))
|
@@ -75,7 +75,7 @@ def get_column_cropped_pages(pages, two_column=True):
|
|
75 |
new_pages.append(right)
|
76 |
else:
|
77 |
new_pages.append(page)
|
78 |
-
|
79 |
return new_pages
|
80 |
|
81 |
def parse_pdf(filename, two_column = True):
|
@@ -94,7 +94,7 @@ def parse_pdf(filename, two_column = True):
|
|
94 |
name_top=name_top,
|
95 |
name_bottom=name_bottom,
|
96 |
record_chapter_name = True,
|
97 |
-
|
98 |
page_start=page_start,
|
99 |
page_stop=None,
|
100 |
|
@@ -114,7 +114,7 @@ def parse_pdf(filename, two_column = True):
|
|
114 |
if word.size >= 11: # 出现chapter name
|
115 |
if cur_chapter is None:
|
116 |
cur_chapter = create_chapter(page.page_number, word.top, word.bottom)
|
117 |
-
elif not cur_chapter.record_chapter_name or (cur_chapter.name_bottom != cur_chapter.name_bottom and cur_chapter.name_top != cur_chapter.name_top):
|
118 |
# 不再继续写chapter name
|
119 |
cur_chapter.page_stop = page.page_number # stop id
|
120 |
chapters.append(cur_chapter)
|
@@ -143,7 +143,7 @@ def parse_pdf(filename, two_column = True):
|
|
143 |
text += f"The {idx}th Chapter {chapter.name}: " + " ".join(chapter.text) + "\n"
|
144 |
|
145 |
logging.getLogger().setLevel(level)
|
146 |
-
return Document(
|
147 |
|
148 |
BASE_POINTS = """
|
149 |
1. Who are the authors?
|
|
|
1 |
from types import SimpleNamespace
|
2 |
import pdfplumber
|
3 |
import logging
|
4 |
+
from langchain.docstore.document import Document
|
5 |
|
6 |
def prepare_table_config(crop_page):
|
7 |
"""Prepare table查找边界, 要求page为原始page
|
8 |
+
|
9 |
From https://github.com/jsvine/pdfplumber/issues/242
|
10 |
"""
|
11 |
page = crop_page.root_page # root/parent
|
|
|
60 |
title_bottom = word.bottom
|
61 |
elif word.text == "Abstract": # 获取页面abstract
|
62 |
top = word.top
|
63 |
+
|
64 |
user_info = [i["text"] for i in extract_words(first_page.within_bbox((x0,title_bottom,x1,top)))]
|
65 |
# 裁剪掉上半部分, within_bbox: full_included; crop: partial_included
|
66 |
return title, user_info, first_page.within_bbox((x0,top,x1,bottom))
|
|
|
75 |
new_pages.append(right)
|
76 |
else:
|
77 |
new_pages.append(page)
|
78 |
+
|
79 |
return new_pages
|
80 |
|
81 |
def parse_pdf(filename, two_column = True):
|
|
|
94 |
name_top=name_top,
|
95 |
name_bottom=name_bottom,
|
96 |
record_chapter_name = True,
|
97 |
+
|
98 |
page_start=page_start,
|
99 |
page_stop=None,
|
100 |
|
|
|
114 |
if word.size >= 11: # 出现chapter name
|
115 |
if cur_chapter is None:
|
116 |
cur_chapter = create_chapter(page.page_number, word.top, word.bottom)
|
117 |
+
elif not cur_chapter.record_chapter_name or (cur_chapter.name_bottom != cur_chapter.name_bottom and cur_chapter.name_top != cur_chapter.name_top):
|
118 |
# 不再继续写chapter name
|
119 |
cur_chapter.page_stop = page.page_number # stop id
|
120 |
chapters.append(cur_chapter)
|
|
|
143 |
text += f"The {idx}th Chapter {chapter.name}: " + " ".join(chapter.text) + "\n"
|
144 |
|
145 |
logging.getLogger().setLevel(level)
|
146 |
+
return Document(page_content=text, metadata={"title": title})
|
147 |
|
148 |
BASE_POINTS = """
|
149 |
1. Who are the authors?
|