Update main.py
Browse files
main.py
CHANGED
@@ -447,11 +447,25 @@ class QualityAssessor:
|
|
447 |
field_scores = result.confidence_scores.copy()
|
448 |
consistency_score = self._check_consistency(result.data)
|
449 |
|
450 |
-
|
451 |
-
|
452 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
453 |
|
454 |
-
|
|
|
|
|
|
|
455 |
review_time = self._estimate_review_time(review_flags, field_scores)
|
456 |
|
457 |
return QualityReport(
|
@@ -467,19 +481,21 @@ class QualityAssessor:
|
|
467 |
required_fields = schema.get('required', [])
|
468 |
properties = schema.get('properties', {})
|
469 |
|
470 |
-
|
471 |
-
|
472 |
-
for field in required_fields:
|
473 |
-
if field not in data or data[field] is None:
|
474 |
-
score -= 0.2
|
475 |
|
|
|
|
|
476 |
for field, value in data.items():
|
477 |
if field in properties:
|
|
|
478 |
expected_type = properties[field].get('type')
|
479 |
if expected_type and not self._check_type(value, expected_type):
|
480 |
-
|
481 |
|
482 |
-
|
|
|
|
|
483 |
|
484 |
def _check_type(self, value: Any, expected_type: str) -> bool:
|
485 |
if value is None:
|
@@ -497,20 +513,49 @@ class QualityAssessor:
|
|
497 |
return isinstance(value, expected_python_type)
|
498 |
|
499 |
def _check_consistency(self, data: Dict[str, Any]) -> float:
|
500 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
501 |
|
502 |
-
def _generate_review_flags(self, field_scores: Dict[str, float], schema_compliance: float, overall_confidence: float) -> List[str]:
|
503 |
flags = []
|
504 |
|
505 |
-
if overall_confidence < 0.
|
506 |
-
flags.append("
|
|
|
|
|
507 |
|
508 |
if schema_compliance < 0.8:
|
509 |
flags.append("schema_compliance_issues")
|
510 |
|
511 |
-
low_confidence_fields = [field for field, score in field_scores.items() if score < 0.
|
512 |
if low_confidence_fields:
|
513 |
-
flags.append(f"
|
|
|
|
|
|
|
|
|
514 |
|
515 |
return flags
|
516 |
|
|
|
447 |
field_scores = result.confidence_scores.copy()
|
448 |
consistency_score = self._check_consistency(result.data)
|
449 |
|
450 |
+
required_fields = schema.get('required', [])
|
451 |
+
|
452 |
+
if field_scores:
|
453 |
+
total_weight = 0
|
454 |
+
weighted_confidence = 0
|
455 |
+
|
456 |
+
for field, confidence in field_scores.items():
|
457 |
+
weight = 2.0 if field in required_fields else 1.0
|
458 |
+
weighted_confidence += confidence * weight
|
459 |
+
total_weight += weight
|
460 |
+
|
461 |
+
avg_field_confidence = weighted_confidence / total_weight
|
462 |
+
else:
|
463 |
+
avg_field_confidence = 0
|
464 |
|
465 |
+
overall_confidence = avg_field_confidence * (0.8 + 0.2 * schema_compliance) * (0.9 + 0.1 * consistency_score)
|
466 |
+
overall_confidence = min(overall_confidence, 1.0)
|
467 |
+
|
468 |
+
review_flags = self._generate_review_flags(field_scores, schema_compliance, overall_confidence, required_fields, result.data)
|
469 |
review_time = self._estimate_review_time(review_flags, field_scores)
|
470 |
|
471 |
return QualityReport(
|
|
|
481 |
required_fields = schema.get('required', [])
|
482 |
properties = schema.get('properties', {})
|
483 |
|
484 |
+
required_present = sum(1 for field in required_fields if field in data and data[field] is not None)
|
485 |
+
required_compliance = required_present / len(required_fields) if required_fields else 1.0
|
|
|
|
|
|
|
486 |
|
487 |
+
type_errors = 0
|
488 |
+
total_fields = 0
|
489 |
for field, value in data.items():
|
490 |
if field in properties:
|
491 |
+
total_fields += 1
|
492 |
expected_type = properties[field].get('type')
|
493 |
if expected_type and not self._check_type(value, expected_type):
|
494 |
+
type_errors += 1
|
495 |
|
496 |
+
type_compliance = 1.0 - (type_errors / total_fields) if total_fields > 0 else 1.0
|
497 |
+
|
498 |
+
return (required_compliance * 0.7 + type_compliance * 0.3)
|
499 |
|
500 |
def _check_type(self, value: Any, expected_type: str) -> bool:
|
501 |
if value is None:
|
|
|
513 |
return isinstance(value, expected_python_type)
|
514 |
|
515 |
def _check_consistency(self, data: Dict[str, Any]) -> float:
|
516 |
+
consistency_score = 1.0
|
517 |
+
|
518 |
+
if 'email' in data and data['email']:
|
519 |
+
if '@' not in str(data['email']):
|
520 |
+
consistency_score -= 0.1
|
521 |
+
|
522 |
+
if 'startDate' in data and 'endDate' in data:
|
523 |
+
try:
|
524 |
+
if data['startDate'] and data['endDate']:
|
525 |
+
if str(data['startDate']) > str(data['endDate']):
|
526 |
+
consistency_score -= 0.15
|
527 |
+
except:
|
528 |
+
pass
|
529 |
+
|
530 |
+
if isinstance(data, dict):
|
531 |
+
for key, value in data.items():
|
532 |
+
if isinstance(value, list):
|
533 |
+
for item in value:
|
534 |
+
if isinstance(item, dict):
|
535 |
+
consistency_score *= self._check_consistency(item)
|
536 |
+
elif isinstance(value, dict):
|
537 |
+
consistency_score *= self._check_consistency(value)
|
538 |
+
|
539 |
+
return max(0.7, consistency_score)
|
540 |
|
541 |
+
def _generate_review_flags(self, field_scores: Dict[str, float], schema_compliance: float, overall_confidence: float, required_fields: List[str], extracted_data: Dict[str, Any]) -> List[str]:
|
542 |
flags = []
|
543 |
|
544 |
+
if overall_confidence < 0.6:
|
545 |
+
flags.append("high_priority_review")
|
546 |
+
elif overall_confidence < 0.8:
|
547 |
+
flags.append("standard_review")
|
548 |
|
549 |
if schema_compliance < 0.8:
|
550 |
flags.append("schema_compliance_issues")
|
551 |
|
552 |
+
low_confidence_fields = [field for field, score in field_scores.items() if score < 0.7]
|
553 |
if low_confidence_fields:
|
554 |
+
flags.append(f"uncertain_fields: {', '.join(low_confidence_fields[:3])}")
|
555 |
+
|
556 |
+
missing_required = [field for field in required_fields if field not in extracted_data or extracted_data[field] is None]
|
557 |
+
if missing_required:
|
558 |
+
flags.append(f"missing_required: {', '.join(missing_required[:3])}")
|
559 |
|
560 |
return flags
|
561 |
|