Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -697,49 +697,38 @@ def mindAsk(
|
|
697 |
from bs4 import BeautifulSoup
|
698 |
import requests
|
699 |
|
700 |
-
'''
|
701 |
-
# 定义函数:从网页中抓取文本
|
702 |
-
def scrape_text(url, proxies) -> str:
|
703 |
-
"""从网页抓取文本
|
704 |
-
|
705 |
-
参数:
|
706 |
-
url (str): 要抓取文本的网址
|
707 |
-
|
708 |
-
返回:
|
709 |
-
str: 抓取到的文本
|
710 |
-
"""
|
711 |
-
headers = {
|
712 |
-
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36',
|
713 |
-
'Content-Type': 'text/plain',
|
714 |
-
}
|
715 |
-
try:
|
716 |
-
response = requests.get(url, headers=headers, proxies=proxies, timeout=8)
|
717 |
-
if response.encoding == "ISO-8859-1":
|
718 |
-
response.encoding = response.apparent_encoding
|
719 |
-
except:
|
720 |
-
return "无法连接到该网页"
|
721 |
-
soup = BeautifulSoup(response.text, "html.parser")
|
722 |
-
for script in soup(["script", "style"]):
|
723 |
-
script.extract()
|
724 |
-
text = soup.get_text()
|
725 |
-
lines = (line.strip() for line in text.splitlines())
|
726 |
-
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
|
727 |
-
text = "\n".join(chunk for chunk in chunks if chunk)
|
728 |
-
return text
|
729 |
-
'''
|
730 |
-
# 修改函数:从网页中抓取文本,限制为前500个字符
|
731 |
# 导入所需模块
|
732 |
|
|
|
|
|
733 |
|
734 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
735 |
def scrape_text(url, proxies) -> str:
|
736 |
-
"""
|
737 |
|
738 |
参数:
|
739 |
url (str): 要抓取文本的网址
|
740 |
|
741 |
返回:
|
742 |
-
str:
|
743 |
"""
|
744 |
headers = {
|
745 |
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36',
|
@@ -756,23 +745,37 @@ def scrape_text(url, proxies) -> str:
|
|
756 |
script.extract()
|
757 |
text = soup.get_text()
|
758 |
|
759 |
-
#
|
760 |
-
text = text[:
|
761 |
|
762 |
-
#
|
763 |
lines = text.split('\n')
|
764 |
selected_lines = []
|
|
|
|
|
765 |
for line in lines:
|
766 |
# 去除多余的空格
|
767 |
cleaned_line = ' '.join(line.split())
|
768 |
-
|
769 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
770 |
|
771 |
# 拼接选中的行
|
772 |
selected_text = '\n'.join(selected_lines)
|
773 |
|
774 |
return selected_text
|
775 |
|
|
|
776 |
#ggins1="请围用户搜索主题,对搜索结果进行全面的总结。\n用户搜索主题:"
|
777 |
txtSumins1 = """
|
778 |
将以下文字进行概括,以便于阅读和理解。 摘要要简明扼要,抓住课文要点,让二年级学生看得懂。 避免使用复杂的句子结构或技术术语。 你的回答应该是中文。
|
@@ -786,10 +789,15 @@ def GGSearch(
|
|
786 |
|
787 |
# 使用用户输入进行联网搜索
|
788 |
url = f"https://www.google.com/search?q={user_input}"
|
|
|
|
|
789 |
headers = {
|
790 |
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
|
791 |
}
|
792 |
-
|
|
|
|
|
|
|
793 |
try:
|
794 |
response = requests.get(url, headers=headers)
|
795 |
soup = BeautifulSoup(response.content, 'html.parser')
|
@@ -819,6 +827,7 @@ def GGSearch(
|
|
819 |
GGSearchins = "\n".join(scraped_text for scraped_text in scraped_texts)
|
820 |
|
821 |
# 更新聊天界面和历史记录
|
|
|
822 |
chatbot.append(("联网搜索结果:", GGSearchins))
|
823 |
history.append(("联网搜索结果:", GGSearchins))
|
824 |
|
|
|
697 |
from bs4 import BeautifulSoup
|
698 |
import requests
|
699 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
700 |
# 导入所需模块
|
701 |
|
702 |
+
from bs4 import BeautifulSoup
|
703 |
+
import requests
|
704 |
|
705 |
+
keywords_to_check = [
|
706 |
+
"首页推荐",
|
707 |
+
"业务合作",
|
708 |
+
"ICP备",
|
709 |
+
"公网安备",
|
710 |
+
"经营许可证",
|
711 |
+
"网络文化许可证",
|
712 |
+
"互联网宗教信息服务许可证",
|
713 |
+
"服务备案",
|
714 |
+
"信息备",
|
715 |
+
"谣言曝光台",
|
716 |
+
"不良信息举报",
|
717 |
+
"举报",
|
718 |
+
"广告投放",
|
719 |
+
"营业执照",
|
720 |
+
]
|
721 |
+
|
722 |
+
|
723 |
+
# 定义函数:从网页中抓取文本并保留每个 URL 最多的前三行内容
|
724 |
def scrape_text(url, proxies) -> str:
|
725 |
+
"""从网页抓取文本,保留每个 URL 最多的前三行内容
|
726 |
|
727 |
参数:
|
728 |
url (str): 要抓取文本的网址
|
729 |
|
730 |
返回:
|
731 |
+
str: 抓取到的文本,每个 URL 最多保留前三行内容
|
732 |
"""
|
733 |
headers = {
|
734 |
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36',
|
|
|
745 |
script.extract()
|
746 |
text = soup.get_text()
|
747 |
|
748 |
+
# Truncate the text, keeping at most the first 777 characters
|
749 |
+
text = text[:777]
|
750 |
|
751 |
+
# 按行划分文本
|
752 |
lines = text.split('\n')
|
753 |
selected_lines = []
|
754 |
+
max_lines = 3 # 最多保留的行数
|
755 |
+
max_line_length = 0 # 最长的行字符数
|
756 |
for line in lines:
|
757 |
# 去除多余的空格
|
758 |
cleaned_line = ' '.join(line.split())
|
759 |
+
line_length = len(cleaned_line)
|
760 |
+
if line_length >= 37:
|
761 |
+
if not any(keyword in cleaned_line for keyword in keywords_to_check):
|
762 |
+
# 如果不包含关键词,保留这一行
|
763 |
+
selected_lines.append(cleaned_line)
|
764 |
+
# Only lines with at least 37 characters are considered for keeping
|
765 |
+
if line_length > max_line_length:
|
766 |
+
# 如果当前行字符数比最长行字符数还大,更新最长行字符数和选中行列表
|
767 |
+
max_line_length = line_length
|
768 |
+
selected_lines = [cleaned_line]
|
769 |
+
elif len(selected_lines) < max_lines:
|
770 |
+
# 如果选中行列表还没有达到最大行数,直接添加当前行
|
771 |
+
selected_lines.append(cleaned_line)
|
772 |
|
773 |
# 拼接选中的行
|
774 |
selected_text = '\n'.join(selected_lines)
|
775 |
|
776 |
return selected_text
|
777 |
|
778 |
+
|
779 |
#ggins1="请围用户搜索主题,对搜索结果进行全面的总结。\n用户搜索主题:"
|
780 |
txtSumins1 = """
|
781 |
将以下文字进行概括,以便于阅读和理解。 摘要要简明扼要,抓住课文要点,让二年级学生看得懂。 避免使用复杂的句子结构或技术术语。 你的回答应该是中文。
|
|
|
789 |
|
790 |
# 使用用户输入进行联网搜索
|
791 |
url = f"https://www.google.com/search?q={user_input}"
|
792 |
+
#url = f"https://www.sogou.com/web?query={user_input}"
|
793 |
+
'''
|
794 |
headers = {
|
795 |
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
|
796 |
}
|
797 |
+
'''
|
798 |
+
headers = {
|
799 |
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.44"
|
800 |
+
}
|
801 |
try:
|
802 |
response = requests.get(url, headers=headers)
|
803 |
soup = BeautifulSoup(response.content, 'html.parser')
|
|
|
827 |
GGSearchins = "\n".join(scraped_text for scraped_text in scraped_texts)
|
828 |
|
829 |
# 更新聊天界面和历史记录
|
830 |
+
|
831 |
chatbot.append(("联网搜索结果:", GGSearchins))
|
832 |
history.append(("联网搜索结果:", GGSearchins))
|
833 |
|