"""
This file contains the abstraction of all ID contexts, which contain
the necessary information and settings specific to a particular ID
document type.
"""
import re
from abc import ABC, abstractmethod
from enum import Enum
from hutts_verification.utils.hutts_logger import logger
__author__ = "Jan-Justin van Tonder"
__copyright__ = "Copyright 2017, Java the Hutts"
__license__ = "BSD"
__maintainer__ = "Jan-Justin van Tonder"
__email__ = "J.vanTonder@tuks.co.za"
__status__ = "Development"
[docs]class IDContext(ABC):
"""
This class is an abstraction of all ID contexts.
The ID contexts serve to contain the necessary information and settings specific to a particular ID
document type for operations such as extracting information from an ID OCR string.
:match_contexts (list): A list of dictionaries that contain the contextual information used in
the process of retrieving field values from the OCR output string.
Example::
{
'field': 'surname', # The field name - can be set to any string one desires.
'find': 'surname', # A string to be used for matching field names.
# in the OCR output string (used to know what to look for).
'field_type': # Indicates if the field value is to be treated as alphanumeric or
FieldType.TEXT_ONLY # just numeric or just alphabetical characters.
# (e.g. indicates that all numbers from field value should be removed
# if the field type is TEXT_ONLY).
'line_type': TITLED_NEWLINE # Indicates the type of line to be considered when looking for the
# field value relative to the 'find' value.
# (e.g. TITLED_NEWLINE indicates that the field value is preceded
# by a field name/title and a newline).
'multi_line': True, # Indicates that the field value spans multiple lines.
'multi_line_end': 'names' # (Optional, unless multi_line is true) A string identifying the next
# field name that indicates the end of the multi-line field value.
'to_uppercase': False, # (Optional) Indicates that the retrieved field value must be
# converted to uppercase.
}
"""
def __init__(self, match_contexts):
"""
Responsible for initialising the IDContext object.
:param match_contexts (list): A list of dictionaries that contain the contextual information
used in the process of retrieving field values from the OCR output string.
"""
# Logging for debugging purposes.
logger.debug('Initialising %s...' % type(self).__name__)
# Assign match contexts
self._match_contexts = match_contexts
[docs] def get_id_info(self, id_string, barcode_data=None, ignore_fields=None, fuzzy_min_ratio=60.0, max_multi_line=2):
"""
Responsible for filtering undesirable fields to be retrieved as well as delegating the responsibility
of extracting ID information from an OCR string and housing said information in a convenient dictionary.
Some type checking is done to reduce the likelihood of errors further down the call stack.
:param id_string: A string containing some ID information.
:param barcode_data: A dictionary object containing information extracted from a barcode.
:param ignore_fields: A list containing fields which are to be ignored during extraction.
:param fuzzy_min_ratio: The threshold ratio for a minimum, acceptable ratio of fuzziness when comparing
two strings.
:param max_multi_line: Specifies the maximum number of lines that is to be extracted from fields that are
noted as running onto multiple lines.
Returns:
- (dict): A dictionary object containing the relevant, extracted ID information.
Raises:
- TypeError: If id_string is not a string.
- TypeError: If barcode_data is not a dictionary.
"""
# Check if arguments passed in are the correct type.
if not isinstance(id_string, str):
raise TypeError(
'Bad type for arg id_string - expected string. Received type "%s".' %
type(id_string).__name__
)
if barcode_data and not isinstance(barcode_data, dict):
raise TypeError(
'Bad type for arg barcode_data - expected dictionary. Received type "%s".' %
type(barcode_data).__name__
)
if ignore_fields and (not isinstance(ignore_fields, list) or not isinstance(ignore_fields[0], str)):
raise TypeError(
'Bad type for arg ignore_fields - expected list of strings. Received type "%s".' %
type(ignore_fields).__name__
)
if not isinstance(fuzzy_min_ratio, float):
raise TypeError(
'Bad type for arg fuzzy_min_ratio - expected float. Received type "%s".' %
type(fuzzy_min_ratio).__name__
)
if not isinstance(max_multi_line, int):
raise TypeError(
'Bad type for arg max_multi_line - expected int. Received type "%s".' %
type(max_multi_line).__name__
)
# Initialise a match context list for extraction.
match_contexts = self._match_contexts[:]
# Check if filtering is necessary.
if ignore_fields is not None:
# Filter out the fields that are to be ignored.
match_contexts = self._filter_ignore_match_contexts(ignore_fields)
# Extract ID information and house it in a dictionary, which is returned.
return self._dictify(match_contexts, id_string, barcode_data, fuzzy_min_ratio, max_multi_line)
def _filter_ignore_match_contexts(self, ignore_fields):
"""
Filters out fields which are to be ignored from the match_contexts.
:param ignore_fields (list): A list containing fields which are to be ignored during extraction.
Returns:
- (dict): A filtered list of match contexts.
"""
filtered_match_contexts = []
for match_context in self._match_contexts:
if match_context['field'] not in ignore_fields:
filtered_match_contexts.append(match_context)
return filtered_match_contexts
@abstractmethod
def _dictify(self, match_contexts, id_string, barcode_data, fuzzy_min_ratio, max_multi_line):
"""
Abstract method for subclasses to implement.
Meant to extract ID information from a string and, possibly, barcode data, which is to be returned
in a convenient dictionary format.
:param match_contexts (list): A list of dictionaries that contain the contextual information used
in the process of retrieving field values from the OCR output string.
:param id_string (str): A string containing some ID information.
:param barcode_data (dict): A dictionary object containing information extracted from a barcode.
:param fuzzy_min_ratio (float): The threshold ratio for a minimum, acceptable ratio of fuzziness
when comparing two strings.
:param max_multi_line (int): Specifies the maximum number of lines that is to be extracted
from fields that are noted as running onto multiple lines.
"""
pass
@staticmethod
def _normalise_match(match_context, match):
"""
Normalises a given match string according to the context it was matched.
:param match_context (dict): A dictionary that provides context for the information
that is to be extracted.
:param match (str): A string containing matched ID information.
Returns:
- (str): A match string normalised according to its matched context.
"""
# If the field value should only be text, strip everything that is numeric.
if match_context['field_type'] == FieldType.TEXT_ONLY:
match = re.sub(r'[\d]', '', match)
# If the field value ought to be numeric only, strip everything that is not numeric.
elif match_context['field_type'] == FieldType.NUMERIC_ONLY:
match = re.sub(r'[^\d]', '', match)
elif match_context['field_type'] == FieldType.DATE_HYPHENATED:
match = re.sub(r'[^\d-]', '', match)
# Check if conversion to uppercase was specified.
if 'to_uppercase' in match_context and match_context['to_uppercase']:
match = match.upper()
# If the field value does not require to be converted to uppercase.
elif 'to_uppercase' in match_context and not match_context['to_uppercase']:
# Convert to lowercase and capitalise the character of each new word.
match = match.lower().title()
return match
[docs]class FieldType(Enum):
"""
An enumerator used to specify the field type for extracted ID information.
"""
TEXT_ONLY = 0
NUMERIC_ONLY = 1
MIXED = 2
DATE_HYPHENATED = 3
[docs]class LineType(Enum):
"""
An enumerator used to specify the line type for extracted ID information.
"""
TITLED_NEWLINE = 0
TITLED_ADJACENT = 1
UNTITLED_NEWLINE = 2
UNTITLED_ADJACENT = 3