|
import express from 'express'; |
|
import axios from 'axios'; |
|
import { Readability } from '@mozilla/readability'; |
|
import { JSDOM } from 'jsdom'; |
|
import TurndownService from 'turndown'; |
|
|
|
|
|
// Shared HTML-to-Markdown converter, created once and reused for every page.
// Headings are written in ATX style ("# Title") and code blocks as fenced blocks.
const turndownOptions = {
  codeBlockStyle: 'fenced',
  headingStyle: 'atx',
};

const turndownService = new TurndownService(turndownOptions);
|
|
|
/**
 * Downloads a web page and converts its main article content to Markdown.
 */
class WebsiteParser {
  /** Request timeout, in milliseconds, used when the constructor gets none. */
  static DEFAULT_TIMEOUT_MS = 15000;

  /**
   * @param {object} [options]
   * @param {number} [options.timeoutMs] - Maximum time to wait for the page
   *   download before the request is aborted.
   */
  constructor({ timeoutMs = WebsiteParser.DEFAULT_TIMEOUT_MS } = {}) {
    this.timeoutMs = timeoutMs;
  }

  /**
   * Fetches `url`, extracts the readable article and renders it as Markdown.
   *
   * @param {string} url - Absolute http(s) address of the page.
   * @returns {Promise<{title: string, content: string, excerpt: string, byline: string, siteName: string}>}
   *   Article fields reported by Readability, with `content` converted to Markdown.
   * @throws {Error} When the URL is not a valid http(s) address, or when the
   *   download or the article extraction fails (original error kept in `cause`).
   */
  async fetchAndParse(url) {
    // Validate up front so a malformed or non-web address is reported as such
    // instead of surfacing as an opaque network error.
    let target;
    try {
      target = new URL(url);
    } catch (error) {
      throw new Error(`некорректный URL: ${url}`, { cause: error });
    }
    if (target.protocol !== 'http:' && target.protocol !== 'https:') {
      throw new Error(`некорректный URL: протокол ${target.protocol} не поддерживается`);
    }

    try {
      const response = await axios.get(target.href, {
        headers: {
          'User-Agent': 'Mozilla/5.0 (compatible; MCPBot/1.0)',
        },
        // Without a timeout a stalled upstream would keep the caller waiting forever.
        timeout: this.timeoutMs,
        // Keep the body as a raw string; otherwise axios may JSON-parse it and
        // JSDOM would receive an object instead of markup.
        responseType: 'text',
      });

      // The page URL is passed to JSDOM as the document URL.
      const dom = new JSDOM(response.data, { url: target.href });
      const article = new Readability(dom.window.document).parse();

      if (!article) {
        throw new Error('не удалось спарсить страницу');
      }

      return {
        title: article.title,
        content: turndownService.turndown(article.content),
        excerpt: article.excerpt,
        byline: article.byline,
        siteName: article.siteName,
      };
    } catch (error) {
      // Wrap with context but keep the original error reachable via `cause`.
      throw new Error(`ошибка парсинга или получения страницы: ${error.message}`, {
        cause: error,
      });
    }
  }
}
|
|
|
// Listening port: the PORT environment variable, or 7860 when it is unset or empty.
const PORT = process.env.PORT || 7860;

// One parser instance shared by all requests.
const parser = new WebsiteParser();

// HTTP application with JSON request-body parsing enabled.
const app = express();
app.use(express.json());
|
|
|
/**
 * POST /parse — body: `{ "url": "<http(s) address>" }`.
 *
 * Responds with `{ title, content, metadata: { excerpt, byline, siteName } }`.
 * A missing or invalid `url` yields 400; a failure while fetching or parsing
 * the page yields 500 with the error message.
 */
app.post('/parse', async (req, res) => {
  // req.body can be undefined (e.g. Express 5 leaves it unset when no body was
  // parsed); fall back to an empty object so this is answered with 400, not 500.
  const { url } = req.body ?? {};

  if (!url) {
    return res.status(400).json({ error: 'необходимо указать URL!' });
  }

  // Only absolute http(s) URLs given as strings are accepted; anything else is
  // a client error rather than a server failure.
  let isWebUrl = false;
  if (typeof url === 'string') {
    try {
      const { protocol } = new URL(url);
      isWebUrl = protocol === 'http:' || protocol === 'https:';
    } catch {
      // new URL() throws on malformed input; isWebUrl stays false.
    }
  }
  if (!isWebUrl) {
    return res.status(400).json({ error: 'некорректный URL: ожидается адрес http или https' });
  }

  try {
    const { title, content, excerpt, byline, siteName } = await parser.fetchAndParse(url);

    res.json({
      title,
      content,
      metadata: { excerpt, byline, siteName },
    });
  } catch (error) {
    res.status(500).json({
      error: error.message,
    });
  }
});
|
|
|
// GET / — replies with a ready-to-run curl command showing how to call POST /parse.
app.get('/', function showUsage(_req, res) {
  const usageExample = `curl -X POST https://prolapse-read.hf.space/parse -H "Content-Type: application/json" -d '{"url": "https://habr.com/ru/companies/serverspace/articles/869252/"}'`;
  res.send(usageExample);
});
|
|
|
// Start accepting connections and log the port once the server is listening.
app.listen(PORT, function onListening() {
  const banner = `Server running on port ${PORT}`;
  console.log(banner);
});
|
|