# Source code for hutts_verification.id_contexts.id_context

"""
This file contains the abstraction of all ID contexts, which contain
the necessary information and settings specific to a particular ID
document type.

"""

import re
from abc import ABC, abstractmethod
from enum import Enum
from hutts_verification.utils.hutts_logger import logger

# Module metadata: authorship, copyright, licence, maintainer contact and
# development status of this file.
__author__ = "Jan-Justin van Tonder"
__copyright__ = "Copyright 2017, Java the Hutts"
__license__ = "BSD"
__maintainer__ = "Jan-Justin van Tonder"
__email__ = "J.vanTonder@tuks.co.za"
__status__ = "Development"


class IDContext(ABC):
    """
    This class is an abstraction of all ID contexts.
    The ID contexts serve to contain the necessary information and settings
    specific to a particular ID document type for operations such as
    extracting information from an ID OCR string.

    :match_contexts (list): A list of dictionaries that contain the contextual
        information used in the process of retrieving field values from the
        OCR output string.

    Example::

        {
            'field': 'surname',         # The field name - can be set to any string one desires.
            'find': 'surname',          # A string to be used for matching field names
                                        # in the OCR output string (used to know what to look for).
            'field_type':               # Indicates if the field value is to be treated as alphanumeric or
                FieldType.TEXT_ONLY,    # just numeric or just alphabetical characters.
                                        # (e.g. indicates that all numbers from field value should be removed
                                        # if the field type is TEXT_ONLY).
            'line_type':                # Indicates the type of line to be considered when looking for the
                TITLED_NEWLINE,         # field value relative to the 'find' value.
                                        # (e.g. TITLED_NEWLINE indicates that the field value is preceded
                                        # by a field name/title and a newline).
            'multi_line': True,         # Indicates that the field value spans multiple lines.
            'multi_line_end': 'names',  # (Optional, unless multi_line is true) A string identifying the next
                                        # field name that indicates the end of the multi-line field value.
            'to_uppercase': False,      # (Optional) Indicates that the retrieved field value must be
                                        # converted to uppercase.
        }
    """

    def __init__(self, match_contexts):
        """
        Responsible for initialising the IDContext object.

        :param match_contexts (list): A list of dictionaries that contain the
            contextual information used in the process of retrieving field
            values from the OCR output string.
        """
        # Logging for debugging purposes.
        logger.debug('Initialising %s...' % type(self).__name__)
        # Assign match contexts
        self._match_contexts = match_contexts

    def get_id_info(self, id_string, barcode_data=None, ignore_fields=None, fuzzy_min_ratio=60.0, max_multi_line=2):
        """
        Responsible for filtering undesirable fields to be retrieved as well
        as delegating the responsibility of extracting ID information from an
        OCR string and housing said information in a convenient dictionary.
        Some type checking is done to reduce the likelihood of errors further
        down the call stack.

        :param id_string (str): A string containing some ID information.
        :param barcode_data (dict): A dictionary object containing information
            extracted from a barcode.
        :param ignore_fields (list): A list containing fields which are to be
            ignored during extraction.
        :param fuzzy_min_ratio (float): The threshold ratio for a minimum,
            acceptable ratio of fuzziness when comparing two strings.
        :param max_multi_line (int): Specifies the maximum number of lines
            that is to be extracted from fields that are noted as running onto
            multiple lines.

        Returns:
            - The value returned by the subclass' ``_dictify`` implementation
              (intended to be a dictionary of the extracted ID information).

        Raises:
            - TypeError: If id_string is not a string.
            - TypeError: If barcode_data is truthy and not a dictionary.
            - TypeError: If ignore_fields is truthy and not a list of strings.
            - TypeError: If fuzzy_min_ratio is not a float.
            - TypeError: If max_multi_line is not an int.
        """
        # Check if arguments passed in are the correct type.
        if not isinstance(id_string, str):
            raise TypeError(
                'Bad type for arg id_string - expected string. Received type "%s".' %
                type(id_string).__name__
            )
        # Falsy values (e.g. None or an empty dict) are let through unchecked.
        if barcode_data and not isinstance(barcode_data, dict):
            raise TypeError(
                'Bad type for arg barcode_data - expected dictionary. Received type "%s".' %
                type(barcode_data).__name__
            )
        # Every element is validated, not just the first one, so that a list
        # such as ['surname', 3] is rejected as the error message promises.
        if ignore_fields and (
            not isinstance(ignore_fields, list)
            or not all(isinstance(field, str) for field in ignore_fields)
        ):
            raise TypeError(
                'Bad type for arg ignore_fields - expected list of strings. Received type "%s".' %
                type(ignore_fields).__name__
            )
        if not isinstance(fuzzy_min_ratio, float):
            raise TypeError(
                'Bad type for arg fuzzy_min_ratio - expected float. Received type "%s".' %
                type(fuzzy_min_ratio).__name__
            )
        if not isinstance(max_multi_line, int):
            raise TypeError(
                'Bad type for arg max_multi_line - expected int. Received type "%s".' %
                type(max_multi_line).__name__
            )
        if ignore_fields is not None:
            # Filter out the fields that are to be ignored.
            match_contexts = self._filter_ignore_match_contexts(ignore_fields)
        else:
            # Work on a shallow copy so the stored list itself is never handed out.
            match_contexts = self._match_contexts[:]
        # Extract ID information and house it in a dictionary, which is returned.
        return self._dictify(match_contexts, id_string, barcode_data, fuzzy_min_ratio, max_multi_line)

    def _filter_ignore_match_contexts(self, ignore_fields):
        """
        Filters out fields which are to be ignored from the match_contexts.

        :param ignore_fields (list): A list containing fields which are to be
            ignored during extraction.

        Returns:
            - (list): A new list holding the match contexts whose 'field'
              value does not appear in ignore_fields.
        """
        return [
            match_context
            for match_context in self._match_contexts
            if match_context['field'] not in ignore_fields
        ]

    @abstractmethod
    def _dictify(self, match_contexts, id_string, barcode_data, fuzzy_min_ratio, max_multi_line):
        """
        Abstract method for subclasses to implement.
        Meant to extract ID information from a string and, possibly, barcode
        data, which is to be returned in a convenient dictionary format.

        :param match_contexts (list): A list of dictionaries that contain the
            contextual information used in the process of retrieving field
            values from the OCR output string.
        :param id_string (str): A string containing some ID information.
        :param barcode_data (dict): A dictionary object containing information
            extracted from a barcode.
        :param fuzzy_min_ratio (float): The threshold ratio for a minimum,
            acceptable ratio of fuzziness when comparing two strings.
        :param max_multi_line (int): Specifies the maximum number of lines
            that is to be extracted from fields that are noted as running onto
            multiple lines.
        """
        pass

    @staticmethod
    def _normalise_match(match_context, match):
        """
        Normalises a given match string according to the context it was matched.

        :param match_context (dict): A dictionary that provides context for
            the information that is to be extracted. Must contain the key
            'field_type'; the key 'to_uppercase' is optional.
        :param match (str): A string containing matched ID information.

        Returns:
            - (str): A match string normalised according to its matched context.
        """
        field_type = match_context['field_type']
        # If the field value should only be text, strip everything that is numeric.
        if field_type == FieldType.TEXT_ONLY:
            match = re.sub(r'[\d]', '', match)
        # If the field value ought to be numeric only, strip everything that is not numeric.
        elif field_type == FieldType.NUMERIC_ONLY:
            match = re.sub(r'[^\d]', '', match)
        # Hyphenated dates keep only digits and hyphens.
        elif field_type == FieldType.DATE_HYPHENATED:
            match = re.sub(r'[^\d-]', '', match)
        # Case is only touched when 'to_uppercase' was explicitly specified;
        # when the key is absent the match keeps its current casing.
        if 'to_uppercase' in match_context:
            if match_context['to_uppercase']:
                match = match.upper()
            else:
                # Convert to lowercase and capitalise the first character of each word.
                match = match.lower().title()
        return match
class FieldType(Enum):
    """
    An enumerator used to specify the field type for extracted ID information.
    """
    TEXT_ONLY = 0
    NUMERIC_ONLY = 1
    MIXED = 2
    DATE_HYPHENATED = 3
class LineType(Enum):
    """
    An enumerator used to specify the line type for extracted ID information.
    """
    TITLED_NEWLINE = 0
    TITLED_ADJACENT = 1
    UNTITLED_NEWLINE = 2
    UNTITLED_ADJACENT = 3