danial0203 committed on
Commit
0bad17c
1 Parent(s): 1cbb4c0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -68
app.py CHANGED
@@ -54,76 +54,76 @@ def process_files_fixed(image_path, page_identifier, error_pages):
54
  error_pages.append(page_identifier)
55
  return []
56
 
57
- prompt = """**Objective:** Extract specific data from a table within an image using OCR.
58
-
59
- **Image Description:** The image contains a table with student information.
60
-
61
- **Columns of Interest:**
62
-
63
- * S.No (Serial Number)
64
- * Admission No.
65
- * Date of Admission
66
- * Name of Student
67
- * Father's Name
68
- * Date of Birth
69
- * Telephone No.
70
- * Address
71
- * F.CNIC (Father's CNIC)
72
- * S.CNIC (Student's CNIC) - Located under the "REMARKS" column
73
- * M.Name (Mother's Name) - Located under the "REMARKS" column
74
-
75
- **Instructions:**
76
-
77
- 1. **Perform OCR:** Use Optical Character Recognition to extract text from the image.
78
- 2. **Table Detection:** Identify the table within the image.
79
- 3. **Column Identification:**
80
- * If table headers are present and visible, use them to identify the columns of interest.
81
- * If headers are missing or unclear, assume the order of columns as specified above.
82
- 4. **Data Extraction:**
83
- * Extract data from each row of the table for the specified columns only.
84
- * Disregard any additional columns present in the table.
85
- * **Important:** Extract data from all rows, do not skip any rows.
86
- * For "Telephone No.", focus on the number itself and ignore any labels like "office" or "residence" associated with it.
87
- * For "F.CNIC", "S.CNIC", and "M.Name", extract this information from the "REMARKS" column.
88
- 5. **Data Verification:**
89
- * Implement checks to ensure the accuracy of extracted data, especially for numerical values like "S.No" and "Telephone No."
90
- * Consider using checksums or validation rules based on known formats (e.g., CNIC format).
91
-
92
- **Output Format:**
93
-
94
- ```json
 
 
95
  {
96
- "data": [
97
- {
98
- "S_No": "1",
99
- "Admission No.": "1604",
100
- "Date of Admission": "25-4-17",
101
- "Name of Student": "Maham Tariq",
102
- "Father's Name": "Tariq Mehman",
103
- "Date of Birth": "12-05-12",
104
- "Telephone No.": "03125350838",
105
- "Address": "Dewan-e-umar Masjid F1014",
106
- "F.CNIC": "61101-9729652-7",
107
- "S.CNIC": "61101-8018797-4",
108
- "M.Name": "Nasira"
109
- },
110
- {
111
- "S_No": "2",
112
- "Admission No.": "1640",
113
- "Date of Admission": "05-10-20",
114
- "Name of Student": "Areej Jibran",
115
- "Father's Name": "M.Jibran",
116
- "Date of Birth": "05-04-14",
117
- "Telephone No.": "03335173534",
118
- "Address": "H#65 st#11 G11/I isb",
119
- "F. CNIC": "37405-0393951-3",
120
- "S.CNIC": "37405-5642572-3",
121
- "M.Name": "Taqdees Jibran"
122
- }
123
- ]
124
  }
125
-
126
- """
 
 
127
 
128
 
129
 
 
54
  error_pages.append(page_identifier)
55
  return []
56
 
57
+ prompt = """**Objective:** Extract specific data from a table within an image using OCR.
58
+
59
+ **Image Description:** The image contains a table with student information.
60
+
61
+ **Columns of Interest:**
62
+
63
+ * S.No (Serial Number)
64
+ * Admission No.
65
+ * Date of Admission
66
+ * Name of Student
67
+ * Father's Name
68
+ * Date of Birth
69
+ * Telephone No.
70
+ * Address
71
+ * F.CNIC (Father's CNIC)
72
+ * S.CNIC (Student's CNIC) - Located under the "REMARKS" column
73
+ * M.Name (Mother's Name) - Located under the "REMARKS" column
74
+
75
+ **Instructions:**
76
+
77
+ 1. **Perform OCR:** Use Optical Character Recognition to extract text from the image.
78
+ 2. **Table Detection:** Identify the table within the image.
79
+ 3. **Column Identification:**
80
+ * If table headers are present and visible, use them to identify the columns of interest.
81
+ * If headers are missing or unclear, assume the order of columns as specified above.
82
+ 4. **Data Extraction:**
83
+ * Extract data from each row of the table for the specified columns only.
84
+ * Disregard any additional columns present in the table.
85
+ * **Important:** Extract data from all rows, do not skip any rows.
86
+ * For "Telephone No.", focus on the number itself and ignore any labels like "office" or "residence" associated with it.
87
+ * For "F.CNIC", "S.CNIC", and "M.Name", extract this information from the "REMARKS" column.
88
+ 5. **Data Verification:**
89
+ * Implement checks to ensure the accuracy of extracted data, especially for numerical values like "S.No" and "Telephone No."
90
+ * Consider using checksums or validation rules based on known formats (e.g., CNIC format).
91
+
92
+ **Output Format:**
93
+
94
+ ```json
95
+ {
96
+ "data": [
97
  {
98
+ "S_No": "1",
99
+ "Admission No.": "1604",
100
+ "Date of Admission": "25-4-17",
101
+ "Name of Student": "Maham Tariq",
102
+ "Father's Name": "Tariq Mehman",
103
+ "Date of Birth": "12-05-12",
104
+ "Telephone No.": "03125350838",
105
+ "Address": "Dewan-e-umar Masjid F1014",
106
+ "F.CNIC": "61101-9729652-7",
107
+ "S.CNIC": "61101-8018797-4",
108
+ "M.Name": "Nasira"
109
+ },
110
+ {
111
+ "S_No": "2",
112
+ "Admission No.": "1640",
113
+ "Date of Admission": "05-10-20",
114
+ "Name of Student": "Areej Jibran",
115
+ "Father's Name": "M.Jibran",
116
+ "Date of Birth": "05-04-14",
117
+ "Telephone No.": "03335173534",
118
+ "Address": "H#65 st#11 G11/I isb",
119
+ "F.CNIC": "37405-0393951-3",
120
+ "S.CNIC": "37405-5642572-3",
121
+ "M.Name": "Taqdees Jibran"
 
 
 
 
122
  }
123
+ ]
124
+ }
125
+
126
+ """
127
 
128
 
129