victor HF staff committed on
Commit
26240b0
·
0 Parent(s):
Files changed (7) hide show
  1. .gitattributes +35 -0
  2. Dockerfile +20 -0
  3. README.md +10 -0
  4. index.html +36 -0
  5. package.json +21 -0
  6. script.js +42 -0
  7. server.js +32 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Use an official Node.js runtime as the base image.
# Node 14 is end-of-life and no longer receives security fixes, so build on a
# maintained LTS line instead.
FROM node:20

# Set the working directory in the container
WORKDIR /usr/src/app

# Copy package.json and package-lock.json first so the dependency layer is
# reused when only application code changes
COPY package*.json ./

# Install the application dependencies
RUN npm install

# Copy the rest of the application code to the working directory
COPY . .

# Hugging Face Docker Spaces route traffic to port 7860 unless the README
# front matter sets `app_port`. server.js listens on $PORT (falling back to
# 3000), so point it at the port the platform expects.
ENV PORT=7860
EXPOSE 7860

# Define the command to run the application
CMD ["node", "server.js"]
README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Dom To Semantic Markdown
3
+ emoji: 📚
4
+ colorFrom: gray
5
+ colorTo: blue
6
+ sdk: docker
7
+ pinned: false
8
+ ---
9
+
10
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
index.html ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>DOM to Semantic Markdown Converter</title>
    <script src="https://cdn.tailwindcss.com"></script>
  </head>
  <body>
    <div class="w-dvh grid h-dvh grid-rows-[50px,1fr] overflow-hidden">
      <!-- Toolbar: URL field, conversion options and the submit button -->
      <div class="flex p-2 items-center gap-5 whitespace-nowrap">
        <!--
          The input lives outside the <form> element, so it is associated with
          it through the `form` attribute. Without that association the
          `required` / `type="url"` constraints are never checked on submit and
          pressing Enter in the field does not submit the form.
        -->
        <input type="url" id="url-input" form="converter-form" placeholder="Enter URL" class="bg-gray-100 h-8 w-96 rounded-lg px-2" required>
        <label>
          <input type="checkbox" id="extract-main-content"> Extract main content
        </label>
        <label>
          <input type="checkbox" id="refify-urls"> Refify URLs
        </label>
        <label>
          <input type="checkbox" id="enable-table-column-tracking"> Enable table column tracking
        </label>
        <button type="submit" class="bg-black px-4 text-white h-8" form="converter-form">Convert to semantic markdown</button>
      </div>
      <!-- Output pane: script.js writes the generated markdown (or an error) here -->
      <div class="bg-gray-100 p-4 overflow-auto text-sm">
        <pre id="markdown-output" class="whitespace-pre-wrap"></pre>
      </div>
    </div>

    <!-- Hidden form that owns the submit event handled in script.js -->
    <form id="converter-form" class="hidden">
      <input type="text" id="website-domain" placeholder="Website domain">
    </form>

    <script src="node_modules/dom-to-semantic-markdown/dist/browser/bundle.js"></script>
    <script type="module" src="script.js"></script>
  </body>
</html>
package.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "dom-to-semantic-markdown",
3
+ "version": "1.0.0",
4
+ "description": "--- title: Dom To Semantic Markdown emoji: 📚 colorFrom: gray colorTo: blue sdk: docker pinned: false ---",
5
+ "main": "server.js",
6
+ "scripts": {
7
+ "start": "node server.js",
8
+ "dev": "nodemon server.js",
9
+ "test": "echo \"Error: no test specified\" && exit 1"
10
+ },
11
+ "author": "",
12
+ "license": "ISC",
13
+ "dependencies": {
14
+ "dom-to-semantic-markdown": "^1.0.11",
15
+ "express": "^4.19.2",
16
+ "axios": "^0.21.1"
17
+ },
18
+ "devDependencies": {
19
+ "nodemon": "^2.0.22"
20
+ }
21
+ }
script.js ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
// `htmlToSMD` is a global that is expected to be defined before this module
// runs (presumably by the dom-to-semantic-markdown browser bundle that
// index.html loads first -- confirm against the bundle).
const { convertHtmlToMarkdown } = htmlToSMD;

/**
 * Submit handler for the converter form.
 *
 * Reads the URL and the conversion options from the page, asks the server's
 * `/fetch-html` endpoint for the page source, converts it to markdown and
 * writes the result into `#markdown-output`. Any failure is shown in the same
 * element as "Error: <message>".
 */
document
  .getElementById("converter-form")
  .addEventListener("submit", async function (e) {
    // The form only exists to raise the submit event; never navigate away.
    e.preventDefault();

    const urlInput = document.getElementById("url-input").value.trim();
    const markdownOutput = document.getElementById("markdown-output");

    // Nothing to fetch: report it here instead of sending a request the
    // server can only reject with a generic failure.
    if (urlInput === "") {
      markdownOutput.textContent = "Error: Please enter a URL";
      return;
    }

    const extractMainContent = document.getElementById(
      "extract-main-content"
    ).checked;
    const refifyUrls = document.getElementById("refify-urls").checked;
    const enableTableColumnTracking = document.getElementById(
      "enable-table-column-tracking"
    ).checked;
    const websiteDomain = document.getElementById("website-domain").value;

    const options = {
      extractMainContent,
      refifyUrls,
      enableTableColumnTracking,
      // An empty field is passed as `undefined` rather than "".
      websiteDomain: websiteDomain || undefined,
    };

    try {
      // Fetch HTML content from the server
      const response = await fetch(
        `/fetch-html?url=${encodeURIComponent(urlInput)}`
      );
      if (!response.ok) {
        throw new Error("Failed to fetch HTML content");
      }
      const htmlContent = await response.text();

      // Convert HTML to Markdown
      const markdown = await convertHtmlToMarkdown(htmlContent, options);
      markdownOutput.textContent = markdown;
    } catch (error) {
      markdownOutput.textContent = `Error: ${error.message}`;
    }
  });
server.js ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
const express = require("express");
const path = require("path");
const axios = require("axios");

const app = express();
const port = process.env.PORT || 3000;

// Bounds for the upstream request so a slow or huge page cannot tie up the
// server indefinitely.
const FETCH_TIMEOUT_MS = 10000;
const MAX_HTML_BYTES = 5 * 1024 * 1024;

/**
 * Parse a user-supplied value as an absolute http(s) URL.
 *
 * @param {unknown} rawUrl - Value of the `url` query parameter.
 * @returns {URL|null} The parsed URL, or null when the value is not a string,
 *   is not a valid absolute URL, or uses a scheme other than http/https.
 */
function parseHttpUrl(rawUrl) {
  // A repeated or nested query parameter is parsed into an array/object.
  if (typeof rawUrl !== "string") {
    return null;
  }

  let parsed;
  try {
    parsed = new URL(rawUrl);
  } catch (error) {
    return null;
  }

  const isHttp = parsed.protocol === "http:" || parsed.protocol === "https:";
  return isHttp ? parsed : null;
}

// Serve static files from the current directory.
// NOTE(review): this makes every non-dot file next to server.js downloadable
// (server.js, package.json, ...). index.html relies on it for script.js and
// the bundle under node_modules, so it is left in place here.
app.use(express.static(__dirname));

/**
 * GET /fetch-html?url=<absolute http(s) URL>
 *
 * Downloads the given page on behalf of the browser and returns its source as
 * plain text. Responds 400 for a missing or invalid URL and 500 when the
 * upstream request fails.
 */
app.get("/fetch-html", async (req, res) => {
  const rawUrl = req.query.url;
  if (!rawUrl) {
    return res.status(400).json({ error: "URL parameter is required" });
  }

  const target = parseHttpUrl(rawUrl);
  if (!target) {
    return res
      .status(400)
      .json({ error: "URL must be an absolute http(s) URL" });
  }

  // NOTE(review): the scheme check does not stop requests to internal or
  // loopback addresses; add an address allow/deny list if this server can
  // reach anything private.
  try {
    const response = await axios.get(target.href, {
      timeout: FETCH_TIMEOUT_MS,
      maxContentLength: MAX_HTML_BYTES,
      responseType: "text",
      // Keep the body as the raw string; do not let axios try to JSON-parse it.
      transformResponse: [(data) => data],
    });

    // The body is third-party markup. Sending it as text/plain (and disabling
    // sniffing) keeps a crafted /fetch-html link from rendering that markup,
    // scripts included, under this app's origin. The client reads it as text.
    res.set("X-Content-Type-Options", "nosniff");
    res.type("text/plain").send(response.data);
  } catch (error) {
    console.error(`Failed to fetch ${target.href}: ${error.message}`);
    res.status(500).json({ error: "Failed to fetch HTML content" });
  }
});

// Send index.html for any other routes
app.get("*", (req, res) => {
  res.sendFile(path.resolve(__dirname, "index.html"));
});

app.listen(port, () => {
  console.log(`Server is running on port ${port}`);
});