"""
This file contains the abstraction of all ID contexts, which contain
the necessary information and settings specific to a particular ID
document type.
"""
import re
from abc import ABC, abstractmethod
from enum import Enum
from hutts_utils.hutts_logger import logger
# Module metadata: authorship, licensing and project status.
__author__ = "Jan-Justin van Tonder"
__copyright__ = "Copyright 2017, Java the Hutts"
__license__ = "BSD"
__maintainer__ = "Jan-Justin van Tonder"
__email__ = "J.vanTonder@tuks.co.za"
__status__ = "Development"
class IDContext(ABC):
    """
    This class is an abstraction of all ID contexts.

    The ID contexts serve to contain the necessary information and settings specific to a particular ID
    document type for operations such as extracting information from an ID OCR string.

    Attributes:
        _match_contexts (list): A list of dictionaries that contain the contextual information used in the
            process of retrieving field values from the OCR output string.
            e.g. {
                'field': 'surname',         // The field name - can be set to any string one desires.
                'find': 'surname',          // A string to be used for matching field names in the OCR
                                            // output string (used to know what to look for).
                'field_type':               // Indicates if the field value is to be treated as alphanumeric
                    FieldType.TEXT_ONLY,    // or just numeric or just alphabetical characters.
                                            // (e.g. indicates that all numbers from the field value should
                                            // be removed if the field type is TEXT_ONLY).
                'line_type':                // Indicates the type of line to be considered when looking for
                    LineType.TITLED_NEWLINE,// the field value relative to the 'find' value.
                                            // (e.g. TITLED_NEWLINE indicates that the field value is
                                            // preceded by a field name/title and a newline).
                'multi_line': True,         // Indicates that the field value spans multiple lines.
                'multi_line_end': 'names',  // (Optional, unless multi_line is true) A string identifying the
                                            // next field name that indicates the end of the multi-line
                                            // field value.
                'to_uppercase': False,      // (Optional) Indicates that the retrieved field value must be
                                            // converted to uppercase.
            }
    """

    def __init__(self, match_contexts):
        """
        Responsible for initialising the IDContext object.

        Args:
            match_contexts (list): A list of dictionaries that contain the contextual information used in the
                process of retrieving field values from the OCR output string.
        """
        # Logging for debugging purposes.
        logger.debug('Initialising %s...' % type(self).__name__)
        # Assign match contexts.
        self._match_contexts = match_contexts

    def get_id_info(self, id_string, barcode_data=None, ignore_fields=None, fuzzy_min_ratio=60.0,
                    max_multi_line=2):
        """
        Responsible for filtering undesirable fields to be retrieved as well as delegating the responsibility
        of extracting ID information from an OCR string and housing said information in a convenient dictionary.
        Some type checking is done to reduce the likelihood of errors further down the call stack.

        Args:
            id_string (str): A string containing some ID information.
            barcode_data (dict, Optional): A dictionary object containing information extracted from a barcode.
            ignore_fields (list, Optional): A list containing fields which are to be ignored during extraction.
            fuzzy_min_ratio (float): The threshold ratio for a minimum, acceptable ratio of fuzziness when
                comparing two strings.
            max_multi_line (int): Specifies the maximum number of lines that is to be extracted from fields
                that are noted as running onto multiple lines.

        Returns:
            The value produced by the subclass's ``_dictify`` implementation, which is meant to be a
            dictionary object containing the relevant, extracted ID information.

        Raises:
            TypeError: If id_string is not a string.
            TypeError: If barcode_data is truthy and not a dictionary.
            TypeError: If ignore_fields is non-empty and not a list consisting solely of strings.
            TypeError: If fuzzy_min_ratio is not a float.
            TypeError: If max_multi_line is not an int.
        """
        # Check if arguments passed in are the correct type.
        if not isinstance(id_string, str):
            raise TypeError(
                'Bad type for arg id_string - expected string. Received type "%s".' %
                type(id_string).__name__
            )
        if barcode_data and not isinstance(barcode_data, dict):
            raise TypeError(
                'Bad type for arg barcode_data - expected dictionary. Received type "%s".' %
                type(barcode_data).__name__
            )
        # Every element is validated, not just the first one, so that a stray non-string entry cannot
        # slip through to the filtering step.
        if ignore_fields and (
            not isinstance(ignore_fields, list)
            or not all(isinstance(field, str) for field in ignore_fields)
        ):
            raise TypeError(
                'Bad type for arg ignore_fields - expected list of strings. Received type "%s".' %
                type(ignore_fields).__name__
            )
        if not isinstance(fuzzy_min_ratio, float):
            raise TypeError(
                'Bad type for arg fuzzy_min_ratio - expected float. Received type "%s".' %
                type(fuzzy_min_ratio).__name__
            )
        if not isinstance(max_multi_line, int):
            raise TypeError(
                'Bad type for arg max_multi_line - expected int. Received type "%s".' %
                type(max_multi_line).__name__
            )
        if ignore_fields is not None:
            # Filter out the fields that are to be ignored.
            match_contexts = self._filter_ignore_match_contexts(ignore_fields)
        else:
            # Work on a shallow copy so the subclass cannot reorder or resize the stored list.
            match_contexts = self._match_contexts[:]
        # Extract ID information and house it in a dictionary, which is returned.
        return self._dictify(match_contexts, id_string, barcode_data, fuzzy_min_ratio, max_multi_line)

    def _filter_ignore_match_contexts(self, ignore_fields):
        """
        Filters out fields which are to be ignored from the match_contexts.

        Args:
            ignore_fields (list): A list containing fields which are to be ignored during extraction.

        Returns:
            (list): A new list holding the match contexts whose 'field' value is not in ignore_fields.
        """
        return [
            match_context
            for match_context in self._match_contexts
            if match_context['field'] not in ignore_fields
        ]

    @abstractmethod
    def _dictify(self, match_contexts, id_string, barcode_data, fuzzy_min_ratio, max_multi_line):
        """
        Abstract method for subclasses to implement.
        Meant to extract ID information from a string and, possibly, barcode data, which is to be returned
        in a convenient dictionary format.

        Args:
            match_contexts (list): A list of dictionaries that contain the contextual information used in the
                process of retrieving field values from the OCR output string.
            id_string (str): A string containing some ID information.
            barcode_data (dict, Optional): A dictionary object containing information extracted from a barcode.
            fuzzy_min_ratio (float): The threshold ratio for a minimum, acceptable ratio of fuzziness when
                comparing two strings.
            max_multi_line (int): Specifies the maximum number of lines that is to be extracted from fields
                that are noted as running onto multiple lines.
        """
        pass

    @staticmethod
    def _normalise_match(match_context, match):
        """
        Normalises a given match string according to the context it was matched.

        Args:
            match_context (dict): A dictionary object that provides context for the information that is to be
                extracted. Must contain 'field_type'; 'to_uppercase' is optional.
            match (str): A string containing matched ID information.

        Returns:
            (str): A match string normalised according to its matched context.

        Raises:
            KeyError: If match_context has no 'field_type' entry.
        """
        field_type = match_context['field_type']
        if field_type == FieldType.TEXT_ONLY:
            # Keep letters, whitespace and hyphens only. \w on its own also admits digits and
            # underscores, so those are stripped explicitly.
            match = re.sub(r'[^\w\s-]|[\d_]', '', match)
        elif field_type == FieldType.NUMERIC_ONLY:
            # Strip everything that is not numeric.
            match = re.sub(r'\D', '', match)
        elif field_type == FieldType.DATE_HYPHENATED:
            # Keep only digits and the hyphens separating the date parts.
            match = re.sub(r'[^\d-]', '', match)
        # Any other field type (e.g. MIXED) is left untouched.

        # Case conversion only applies when the context explicitly specifies 'to_uppercase'.
        if 'to_uppercase' in match_context:
            if match_context['to_uppercase']:
                match = match.upper()
            else:
                # Convert to lowercase and capitalise the first character of each word.
                match = match.lower().title()
        return match
class FieldType(Enum):
    """
    An enumerator used to specify the field type for extracted ID information.
    """
    # The field value is expected to consist of alphabetical text.
    TEXT_ONLY = 0
    # The field value is expected to consist of digits only.
    NUMERIC_ONLY = 1
    # The field value may contain both text and digits.
    MIXED = 2
    # The field value is expected to be a date made up of digits separated by hyphens.
    DATE_HYPHENATED = 3
class LineType(Enum):
    """
    An enumerator used to specify the line type for extracted ID information.

    The line type describes where a field value is to be looked for relative to the text that was
    matched for the field.
    """
    # The field value is preceded by a field name/title and a newline.
    TITLED_NEWLINE = 0
    # Presumably the field value sits on the same line as its field name/title - confirm against
    # the concrete ID contexts.
    TITLED_ADJACENT = 1
    # Presumably the field has no title and its value is on a following line - confirm against
    # the concrete ID contexts.
    UNTITLED_NEWLINE = 2
    # Presumably the field has no title and its value is on the same line - confirm against
    # the concrete ID contexts.
    UNTITLED_ADJACENT = 3