SkyNait committed on
Commit
da94345
·
1 Parent(s): 25c35b4

rabbit_mq correct pattern

Browse files
__pycache__/inference_svm_model.cpython-310.pyc CHANGED
Binary files a/__pycache__/inference_svm_model.cpython-310.pyc and b/__pycache__/inference_svm_model.cpython-310.pyc differ
 
__pycache__/mineru_single.cpython-310.pyc CHANGED
Binary files a/__pycache__/mineru_single.cpython-310.pyc and b/__pycache__/mineru_single.cpython-310.pyc differ
 
__pycache__/topic_extr.cpython-310.pyc CHANGED
Binary files a/__pycache__/topic_extr.cpython-310.pyc and b/__pycache__/topic_extr.cpython-310.pyc differ
 
__pycache__/worker.cpython-310.pyc CHANGED
Binary files a/__pycache__/worker.cpython-310.pyc and b/__pycache__/worker.cpython-310.pyc differ
 
output/sample_spec_output.md CHANGED
@@ -1,63 +1,29 @@
1
- # Paper 1 and Paper 2: Pure Mathematics
2
-
3
- To support the co-teaching of this qualification with the AS Mathematics qualification, common content has been highlighted in bold..
4
-
5
- ![](/topic-extraction/8ad59648516f3e9564f0e5df8114f87cd48c2fe5f34b15c28c704962b31adc70.jpg)
6
-
7
- ![](/topic-extraction/8116200eb839fa0c6d87bb6e96db29559283cc3d7de7ff3834326012ca2d37e3.jpg)
8
-
9
- ![](/topic-extraction/4cc2bdaa64251411d29493fafb406ad9974260459b247be498e312e29b969a15.jpg)
10
-
11
- ![](/topic-extraction/a06c5ac3695ab4caff0dc2724c9a8a288fefc94cb1b79e370975be31d3869230.jpg)
12
-
13
- ![](/topic-extraction/c5631064f99712df9f9591a603ae00098be039f661264ce67925a18c90e06142.jpg)
14
-
15
- ![](/topic-extraction/88fdba19faed0f761e041fbf040b8cfc57c73bdf36fcd6e32f59f09ac91aeab6.jpg)
16
-
17
- ![](/topic-extraction/a9a669c1c64b92583f2cc72a8216854bb76c90586a0615afdaee9f0d26d120e9.jpg)
18
-
19
- ![](/topic-extraction/6b5c9d3211ba2d7c95de68ed81f03fc32f1aba22d55e8ba53fb4586fdb270426.jpg)
20
-
21
- ![](/topic-extraction/310a9b6f2764de2b165de3343fea3e64ddbd36f8d43c5962dd48730a9e729019.jpg)
22
-
23
- ![](/topic-extraction/4d3fa5997973de85edbf15b31c91f5d1822c5d9698cbb953d1cd9fff04fca369.jpg)
24
-
25
- ![](/topic-extraction/480483c021c62d2499f240729e15a6aae16aa6d3be9aec2c65a16e6dd6b878e5.jpg)
26
-
27
- ![](/topic-extraction/4b2d26dfff554e5c0e2e33968ea3fbae882e9deec5aa2607288ac72f05fbc093.jpg)
28
-
29
- ![](/topic-extraction/de35db590f61b05cf88744cd89789d664a6abb48c94ef6fb2f380404e0b6aa56.jpg)
30
-
31
- ![](/topic-extraction/230e72098ba7930d8338b8c0bc7c184e7129ec59141952e7c57f127655a00164.jpg)
32
-
33
- ![](/topic-extraction/0247eaaab6c95cbc124fa87c44936e2d9963699fd3bc7522596f997029426354.jpg)
34
-
35
- ![](/topic-extraction/e95d6f913ef911a562b5c5c0e336cf6265c90753738fbf1fb5b86a0370573286.jpg)
36
-
37
- ![](/topic-extraction/e75eb0c3ddebeb5cdef32f0a4281f98c0f435792630fd5cf2a60827fed6496ae.jpg)
38
-
39
- ![](/topic-extraction/6db71c2167c71b32503e4025534a9111558ee893c9b94335f73e8d965bdb3e7f.jpg)
40
-
41
  # Paper 3: Statistics and Mechanics
42
-
43
- All the Pure Mathematics content is assumed knowledge for Paper 3 and may be tested in parts of questions.
44
-
45
- To support the co-teaching of this qualification with the AS Mathematics qualification, common content has been highlighted in bold..
46
-
47
- ![](/topic-extraction/bdc8dba766b71c8baa1fa78425fa9b05960de72fa2e3cd58acec0ed9f6a38484.jpg)
48
-
49
- ![](/topic-extraction/8a7e0f0815ec510978f1e4629f452be0f698ae3b2b73fdd0c6cb6d01b73c658d.jpg)
50
-
51
- ![](/topic-extraction/c0f6c78a4393655d252cf16cf91690f5b853c925eef73e15ce9473f6039518e8.jpg)
52
-
53
- ![](/topic-extraction/d8fc74d90978852def7740a09c94949a8b30a37248555561f4997f4d40bad7b1.jpg)
54
-
55
- ![](/topic-extraction/c27edd49d1ff81e5e31321b53fc559bac988181af672cda7fe65fb17e48fd674.jpg)
56
-
57
- ![](/topic-extraction/f82f21d337bc60d0dc797db76b5738144904989fb044160d9fcceaa41651aa33.jpg)
58
-
59
- ![](/topic-extraction/74059e4d980d876dec0451f14e791402349da955dda7308450dccc287bed0147.jpg)
60
-
61
- ![](/topic-extraction/263c0b8a692bad208c16544fd15d1b12c10dae66e88f3067e4c34932af7eebc4.jpg)
62
-
63
- ![](/topic-extraction/80919764b501319dc4a0fd6715bd31192ad14c7090ed0aed89eabef833b7622e.jpg)
 
1
+ # Paper 1 and Paper 2: Pure Mathematics
2
+ ![](/topic-extraction/1735ae4efca00dee8671c97e1a177e90944b62fa7bd8d53c48c040ea4e21f7f8.jpg)
3
+ ![](/topic-extraction/be2d2d250d82e414fcb5a6192dd3c3c5c99a0985140500305afd0de4952e8983.jpg)
4
+ ![](/topic-extraction/f3a5100efb727216b6a16d61ed5f68a2ae2a42f902c5c9e09ceefca899b32eae.jpg)
5
+ ![](/topic-extraction/42e24b0dc9c9f626f7055594c8c020a5fc8f9376dd0dd2900e4d03b47b0536bb.jpg)
6
+ ![](/topic-extraction/597f0eb70cb889f633c2b79a6113adfb1f5ceb35d6158f9628b51c46537d3da1.jpg)
7
+ ![](/topic-extraction/d60705c60460684834c804faaab3244fc0b79e11c83a5c6ffb362609b60ee0e6.jpg)
8
+ ![](/topic-extraction/8e007b4bfc29566954bd98a7c962a35025ce7a3080de9ce0c5b977e5162cc387.jpg)
9
+ ![](/topic-extraction/9d7e1d4fb02d83bf4dd35aae5d71370d068aba8c9a204b694c9e979fd4dfc9ff.jpg)
10
+ ![](/topic-extraction/dfe618c5f808cfd5273efc88db1e5bf9cf31399309b9ab2f134c04e920275714.jpg)
11
+ ![](/topic-extraction/783b544040b0bfe63ccd9603751fe55b601dfef98534c6a472f1aac4b3d9b865.jpg)
12
+ ![](/topic-extraction/a19704ca444f538d073dde0e859f5e8d10db8eaefa08e91f9343613534f7c9a2.jpg)
13
+ ![](/topic-extraction/9a7a89ef29074254be69210493504768fa6c96774ffff91504f7d1c80b600142.jpg)
14
+ ![](/topic-extraction/a9cad5a6804e1c5fca12f87bd43b0601975b4326b3483d66d21396b511aa4ba1.jpg)
15
+ ![](/topic-extraction/86162a5ce5300c0a2be58c1e36bce61c3a08bad6fb1dcaf23632a145676332b8.jpg)
16
+ ![](/topic-extraction/82aa856101fbedd4f456de3e149dfff1e0f6972b8c1e3820f94fc62d4341e9ab.jpg)
17
+ ![](/topic-extraction/700ee930d2b1cc81dd617f33a0461264ead90dc9200963f2e490d2df54b41f14.jpg)
18
+ ![](/topic-extraction/60c29216b5c0a2e19c9328bccf6ce25290446a996ca9cc92fceea58711e03f77.jpg)
19
+ ![](/topic-extraction/1cb3e959a761fcfbcc2a317451cf4742cfc6ee95a0eeb3257375c80c953bf73a.jpg)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  # Paper 3: Statistics and Mechanics
21
+ ![](/topic-extraction/c30c36fb210f3d9f9f56f823d7e50c8ace40ab25a6c38954fcad19f60663c7f3.jpg)
22
+ ![](/topic-extraction/08748ec00b16344413669fcc0b1665e066bc393de274ac63b2a5e88b4ec782e8.jpg)
23
+ ![](/topic-extraction/2593ea2dfce1f1c421cc420f7d31b52f80d8efaa5bc2f816eeee2e6169eae2ab.jpg)
24
+ ![](/topic-extraction/575f0d9292d45b9e94c5f428eaa48d2de44bad44e75f5b007bb44fdeb764fff5.jpg)
25
+ ![](/topic-extraction/43cb27d3a7aacb5a9a107014d7d4bcf89412901aa4d3fc225d5d86506c3b3c04.jpg)
26
+ ![](/topic-extraction/9c838091270939c1208f3771713b871b10ae59bdc72c432d77d2879ff02f428e.jpg)
27
+ ![](/topic-extraction/6f5b88842305bfa7fded68d32e996d9a2a5099ba33db6cf05532d68dbf6b00fe.jpg)
28
+ ![](/topic-extraction/bdcf58bbb2cb3ae6d0f43be5a5b350a3280b162945726a6295d4d49250149645.jpg)
29
+ ![](/topic-extraction/2ac2cd89e6a9b2288736037873ed54be84dedbb731d9352eb14687e1828f32cd.jpg)
 
 
 
 
 
 
 
 
 
 
 
 
 
topic_extr.py CHANGED
@@ -112,13 +112,10 @@ class TopicExtractionProcessor:
112
  try:
113
  key = input_file.get("key", "")
114
  url = input_file.get("url", "")
115
- page_field = input_file.get("page")
116
-
117
- if not url or not page_field:
118
- raise ValueError("Missing required 'url' or 'page' in input file")
119
 
120
- page_indices = self.parse_page_range(page_field)
121
- logger.info("Processing %s with pages %s", key, page_indices)
122
 
123
  if url.startswith(("http://", "https://")):
124
  response = requests.get(url)
@@ -127,8 +124,12 @@ class TopicExtractionProcessor:
127
  else:
128
  with open(url, "rb") as f:
129
  pdf_bytes = f.read()
 
 
 
 
 
130
 
131
- subset_pdf = self.create_subset_pdf(pdf_bytes, page_indices)
132
 
133
  dataset = PymuDocDataset(subset_pdf)
134
  inference = doc_analyze(
 
112
  try:
113
  key = input_file.get("key", "")
114
  url = input_file.get("url", "")
115
+ pages = input_file.get("page", [])
 
 
 
116
 
117
+ if not url or not pages:
118
+ raise ValueError("Missing required 'url' or 'page' in input file")
119
 
120
  if url.startswith(("http://", "https://")):
121
  response = requests.get(url)
 
124
  else:
125
  with open(url, "rb") as f:
126
  pdf_bytes = f.read()
127
+
128
+ pages = self.parse_page_range(pages)
129
+ logger.info("Processing %s with pages %s", key, pages)
130
+
131
+ subset_pdf = self.create_subset_pdf(pdf_bytes, pages)
132
 
 
133
 
134
  dataset = PymuDocDataset(subset_pdf)
135
  inference = doc_analyze(
worker.py CHANGED
@@ -129,6 +129,7 @@ class RabbitMQWorker:
129
  elif pattern == "topic_extraction":
130
  data = body_dict.get("data")
131
  input_files = data.get("input_files")
 
132
  logger.info("[Worker %s] Found %d file(s) for topic extraction.", thread_id, len(input_files))
133
 
134
  for file in input_files:
 
129
  elif pattern == "topic_extraction":
130
  data = body_dict.get("data")
131
  input_files = data.get("input_files")
132
+ # contexts = []
133
  logger.info("[Worker %s] Found %d file(s) for topic extraction.", thread_id, len(input_files))
134
 
135
  for file in input_files: