|
""" |
|
1. 只有使用PyPDFLoader(from langchain_community.document_loaders import PyPDFLoader), 才能获得在metadata中获得page的信息。 |
|
|
|
""" |
|
|
|
from langchain_community.vectorstores import FAISS |
|
from langchain_community.embeddings import HuggingFaceEmbeddings
|
import streamlit as st |
|
import re |
|
from langchain.llms.base import LLM |
|
from langchain.llms.utils import enforce_stop_tokens |
|
from typing import Dict, List, Optional, Tuple, Union |
|
import requests |
|
import json |
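

# Illustrative sketch (not part of the app's original flow): how documents might be
# loaded with PyPDFLoader so that each page carries a "page" entry in its metadata,
# and then indexed into FAISS. The function name, the default PDF path and the
# embedding model name are assumptions made for demonstration only.
def _build_demo_vectorstore(pdf_path="docs/example.pdf",
                            model_name="sentence-transformers/all-MiniLM-L6-v2"):
    from langchain_community.document_loaders import PyPDFLoader

    # PyPDFLoader returns one Document per PDF page; each doc.metadata contains
    # "source" (the file path) and "page" (the 0-based page number).
    pages = PyPDFLoader(pdf_path).load()

    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    vectorstore = FAISS.from_documents(pages, embeddings)
    return vectorstore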
|
|
def extract_document_name(path):
    """Return the document's file name, i.e. the last segment of its path."""
    path_segments = path.split("/")
    document_name = path_segments[-1]
    return document_name
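
# For example, extract_document_name("data/reports/annual_report.pdf") returns
# "annual_report.pdf" (the path here is purely illustrative).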
|
|
|
|
|
|
def extract_sentence(text):
    """
    Extract the first complete sentence from a passage, where the sentence
    must contain at least 5 whitespace-separated words.

    Args:
        text: the passage to extract from.

    Returns:
        The extracted sentence, or None if no sentence qualifies.
    """
    text = text.replace('\n\n', '')
    # Split on Chinese sentence-ending punctuation.
    sentences = re.split(r'[。?!;]', text)
    # Keep only fragments with at least 5 whitespace-separated words.
    # Note: a fully Chinese fragment without spaces counts as a single "word".
    sentences = [sentence for sentence in sentences if len(sentence.split()) >= 5]
    return sentences[0] if sentences else None
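
# For example (illustrative text only):
# extract_sentence("too short。This second sentence has more than five words。")
# returns "This second sentence has more than five words", since the first
# fragment contains fewer than 5 whitespace-separated words.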
|
|
|
|
|
|
|
def rag_source(docs):
    """Build a human-readable source attribution string from retrieved documents."""
    print('starting source function!')
    print('docs now:', docs)
    source = ""
    for i, doc in enumerate(docs):
        # Guard against extract_sentence() returning None when no fragment qualifies.
        excerpt = extract_sentence(doc.page_content) or ""
        # "page" is only present when the documents were loaded with PyPDFLoader.
        source += (
            f"**【信息来源 {i+1}】** "
            + extract_document_name(doc.metadata['source'])
            + ','
            + f"第{doc.metadata['page'] + 1}页"
            + ',部分内容摘录:'
            + excerpt
            + '\n\n'
        )
    print('source:', source)
    return source
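

# Illustrative sketch (an assumption, not the app's actual chain): one way
# rag_source() could be combined with similarity search and an LLM answer in
# the Streamlit UI. The function name, the k value and the prompt wording are
# made up for demonstration.
def _demo_answer_with_sources(vectorstore, llm, query):
    # Retrieve the chunks most relevant to the user's question.
    docs = vectorstore.similarity_search(query, k=3)
    context = "\n\n".join(doc.page_content for doc in docs)
    answer = llm(f"请根据以下资料回答问题:\n{context}\n\n问题:{query}")
    # Append the per-document attribution string built by rag_source().
    st.markdown(answer + "\n\n" + rag_source(docs))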
|
|
|
|
|
|