// read / server.js
// 2ch's picture
// Update server.js
// 101262d verified
import express from 'express';
import axios from 'axios';
import { Readability } from '@mozilla/readability';
import { JSDOM } from 'jsdom';
import TurndownService from 'turndown';
// HTML-to-Markdown converter used by the parser below, configured with the
// `atx` heading style and `fenced` code blocks.
const turndownOptions = {
  codeBlockStyle: 'fenced',
  headingStyle: 'atx',
};
const turndownService = new TurndownService(turndownOptions);
/**
 * Downloads a web page and extracts its main article as Markdown.
 *
 * Uses the module-level `axios`, `JSDOM`, `Readability` and
 * `turndownService` bindings.
 */
class WebsiteParser {
  /**
   * @param {object} [options]
   * @param {number} [options.timeoutMs=15000] - Milliseconds after which the
   *   HTTP request is aborted, so a stalled upstream cannot hang the caller.
   */
  constructor({ timeoutMs = 15000 } = {}) {
    this.timeoutMs = timeoutMs;
  }

  /**
   * Fetch `url`, extract the article with Readability and convert its HTML
   * content to Markdown.
   *
   * @param {string} url - Absolute http(s) URL of the page to fetch.
   * @returns {Promise<{title: *, content: string, excerpt: *, byline: *, siteName: *}>}
   *   `content` is the Markdown text; the other fields are passed through
   *   from the Readability result.
   * @throws {Error} When the URL is invalid or not http(s), the request
   *   fails, or Readability cannot extract an article. The message keeps the
   *   `ошибка парсинга или получения страницы: ` prefix and the original
   *   error is attached as `cause`.
   */
  async fetchAndParse(url) {
    let dom;
    try {
      // `new URL` throws on malformed input. Only http(s) is allowed because
      // the URL comes from the caller and is fetched from this server.
      // NOTE(review): this does not block internal/private hosts.
      const { protocol } = new URL(url);
      if (protocol !== 'http:' && protocol !== 'https:') {
        throw new Error(`неподдерживаемый протокол: ${protocol}`);
      }

      const response = await axios.get(url, {
        headers: {
          'User-Agent': 'Mozilla/5.0 (compatible; MCPBot/1.0)',
        },
        timeout: this.timeoutMs,
      });

      // Passing `url` lets jsdom resolve relative links against the page URL.
      dom = new JSDOM(response.data, { url });
      const article = new Readability(dom.window.document).parse();
      if (!article) {
        throw new Error('не удалось спарсить страницу');
      }

      return {
        title: article.title,
        content: turndownService.turndown(article.content),
        excerpt: article.excerpt,
        byline: article.byline,
        siteName: article.siteName,
      };
    } catch (error) {
      throw new Error(
        `ошибка парсинга или получения страницы: ${error.message}`,
        { cause: error },
      );
    } finally {
      // Release the jsdom window on success and on failure alike.
      if (dom) {
        dom.window.close();
      }
    }
  }
}
// Parser instance shared by the request handlers below.
const parser = new WebsiteParser();

// Listening port: the PORT environment variable when set to a non-empty
// value, otherwise 7860.
const PORT = process.env.PORT || 7860;

// Express application with the JSON body-parsing middleware installed.
const app = express();
app.use(express.json());
/**
 * POST /parse — fetch the page at `body.url` and respond with its extracted
 * article.
 *
 * Responses:
 *   200 `{ title, content, metadata: { excerpt, byline, siteName } }`
 *   400 `{ error }` when `url` is missing, empty, or not a string
 *   500 `{ error }` when fetching or parsing the page fails
 */
app.post('/parse', async (req, res) => {
  // Depending on the Express version, req.body may be undefined when no JSON
  // body was parsed (e.g. missing or non-JSON Content-Type), so it is not
  // destructured directly: that would surface as a 500 instead of a 400.
  const body = req.body;
  const url = body !== null && typeof body === 'object' ? body.url : undefined;

  // Non-string values (numbers, objects, arrays) are client errors too.
  if (typeof url !== 'string' || url.trim() === '') {
    return res.status(400).json({ error: 'необходимо указать URL!' });
  }

  try {
    const { title, content, excerpt, byline, siteName } = await parser.fetchAndParse(url);
    return res.json({
      title,
      content,
      metadata: { excerpt, byline, siteName },
    });
  } catch (error) {
    return res.status(500).json({ error: error.message });
  }
});
// GET / — respond with a ready-to-run curl example for the /parse endpoint.
const usageExample = `curl -X POST https://prolapse-read.hf.space/parse -H "Content-Type: application/json" -d '{"url": "https://habr.com/ru/companies/serverspace/articles/869252/"}'`;

app.get('/', (_req, res) => {
  res.send(usageExample);
});
// Start the HTTP server on PORT; the callback logs the port it was given.
const logListening = () => {
  console.log(`Server running on port ${PORT}`);
};

app.listen(PORT, logListening);