Coverage for id_contexts/sa_id.py : 87%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
""" ---------------------------------------------------------------------- Authors: Stephan Nell, Marno Hermann, Jan-Justin van Tonder ---------------------------------------------------------------------- This file contains the abstraction and high-level logic of South African ID contexts. ---------------------------------------------------------------------- """
import re from abc import abstractmethod from fuzzywuzzy import fuzz from datetime import datetime from id_contexts.id_context import IDContext, LineType from hutts_utils.hutts_logger import logger
class SAID(IDContext): """ An abstract class for South African IDs. Contains the high-level logic that is relevant to all South African IDs. """ # Define a class-level constant for a minimum fuzzy ratio during post-processing. POST_PROCESS_MIN_FUZZY_RATIO = 70.0
# Define a class-level constant for a valid ID number length. VALID_ID_LENGTH = 13
# Define a class-level constant as a minimum age year delta for year threshold. MIN_AGE_DELTA = 15
# Define a class-level constant as a year delta for year threshold. YEAR_DELTA = 100
def __init__(self, match_contexts): """ Initialises the SAID object.
Args: match_contexts (list): A list of dictionaries that contain the contextual information used in the process of retrieving field values from the OCR output string. """ # Logging for debugging purposes. # Initialise parent.
def _dictify(self, match_contexts, id_string, barcode_data, fuzzy_min_ratio, max_multi_line): """ This function is responsible for generating a dictionary object containing the relevant ID information, such as names, surname, ID number, etc., from a given input string containing said relevant information.
Authors: Jan-Justin van Tonder
Args: match_contexts (list): A list of dictionaries that contain the contextual information used in the process of retrieving field values from the OCR output string. id_string (str): A string containing some ID information. barcode_data (dict, Optional): A dictionary object containing information extracted from a barcode. fuzzy_min_ratio (float): The threshold ratio for a minimum, acceptable ratio of fuzziness when comparing two strings. max_multi_line (int): Specifies the maximum number of lines that is to be extracted from fields that are noted as running onto multiple lines.
Returns: (dict): A dictionary object containing the relevant, extracted ID information. """ # Given a string containing extracted ID text, # create a dictionary object and populate it with # relevant information from said text. # Check if barcode data, containing the id number, exists and # if so, save it and extract some relevant information from it. # It should overwrite any existing fields that can be extracted from the id number, since # the information embedded within the id number is more reliable, at least theoretically. # Attempt to populate id_info with information from the given ID string. # Perform some custom post-processing on the information that was extracted. # Return the info that was found.
def _populate_id_information(self, match_contexts, id_string, id_info, fuzzy_min_ratio, max_multi_line): """ This function is responsible for populating a dictionary object with information that it is able to find and extract from a given string containing ID information.
Authors: Jan-Justin van Tonder
Args: id_string (str): A string containing some ID information. id_info (dict): A dictionary object used to house extracted ID information. fuzzy_min_ratio (float): The threshold ratio for a minimum, acceptable ratio of fuzziness when comparing two strings. max_multi_line (int): Specifies the maximum number of lines that is to be extracted from fields that are noted as running onto multiple lines. """ # Split the id_string on the newline character to generate a list. # Attempt to retrieve matches. # Extract desired field name from context as key. # Only retrieve information if it does not exist or it could not previously # be determined. # If the ID number has been retrieved, use it to extract other useful information. # It should overwrite any existing fields that can be extracted from the id number, since # the information embedded within the id number is more reliable, at least theoretically.
def _get_match(self, id_string_list, match_context, fuzzy_min_ratio, max_multi_line): """ This function is responsible for searching through a list of lines from an ID string and extracting the relevant ID information based on some context for image_processing that is provided as input. Fuzzy string matching is performed on field names in order to extract field values. This process is assisted with a context that is to be provided.
Authors: Jan-Justin van Tonder
Args: id_string_list (list): An ID string that has been broken down into a list of individual lines. match_context (dict): A dictionary object that provides context for the information that is to be extracted. fuzzy_min_ratio (float): The threshold ratio for a minimum, acceptable ratio of fuzziness when comparing two strings. max_multi_line (int): Specifies the maximum number of lines that is to be extracted from fields that are noted as running onto multiple lines. e.g. Given OCR output such as : ... Names\n This is a long\n long list of names\n that spans multiple\n lines\n ... max_multi_line = 2, means that only the string: "This is a long list of names" is retrieved.
Returns: (str): A string containing the extracted information, if a match was found. (None): If nothing was matched or an extracted value is an empty string. """ # Set the most suitable fuzzy matching function based on line type. # Iterate over the id_string list to find fuzzy matches. # Check to see if we can jump ahead and ignore the current index. # Is there a match? # Set new best match ratio and retrieve the info if possible # Check for special cases of extraction. # Check to see if we are dealing with a field value on a single line adjacent to the field name. # Check to see if we are going out of bounds of the string before proceeding. # We are only interested in field value, not field name. # e.g: Surname\n # Smith\n # ... # ignore 'Surname' so as to be able to manually specify field name # in the context settings. # Retrieve the field value on the very next line. # If the field value exists over multiple lines. # Determine the lower bound index for field values that span multiple lines. # There is nothing to find in this case. # Determine the upper bound index for field values that span multiple lines. # Don't go out of bounds. # Iterate ahead to retrieve the field value that spans over multiple lines. # For each of the specified endpoints, check if the end of the field value has # been reached. match_context['multi_line_end'], id_string_list[forward_index] ) # Otherwise, add the line to the field value. # Check if a legitimate match was found before proceeding. # Normalise the match found. # Final check to see if an empty string ('', not None) is the match found, return None if this is the case. # Otherwise return what we have found.
@abstractmethod def _get_idiosyncratic_match(self, match_context, id_string_list, current_index): """ Abstract method to be implemented by subclasses. Meant to retrieve matches that are particular to a context of a subclass.
Args: match_context (dict): A dictionary object that provides context for the information that is to be extracted. id_string_list (list): An ID string that has been broken down into a list of individual lines. current_index (int): The current index within the ID string list. """ pass
@staticmethod def _id_number_information_extraction(match_contexts, id_info, id_number): """ This function is responsible for extracting information from a given ID number and populating a given dictionary object with the extracted information.
Authors: Marno Hermann Stephan Nell Jan-Justin van Tonder
Args: match_contexts (list): A list of dictionaries that contain the contextual information used in the process of retrieving field values from the OCR output string. id_info (dict): A dictionary object containing extracted ID information. id_number (str): An ID number. """ # Extract date of birth digits from ID number. # Populate id_info with date of birth. # Extract gender digit from ID Number. # Populate id_info with gender info. # Currently, the genders on South African IDs are binary, meaning an individual is # either male or female. # Extract status digit from ID Number. # Populate id_info with status info.
def _post_process(self, id_info): """ Used to perform custom processing after extraction has taken place. All custom operations that are required after all the extraction has taken place, should be called from within this function.
Authors: Jan-Justin van Tonder
Args: id_info (dict): A dictionary object used to house extracted ID information.
Returns: (dict): The original id_info, with some post-processed field values. """ # Check if date of birth field exists for post-processing. # Check if country of birth field exists for post-processing. # We should be fairly certain, with a margin for error, that we have a match. # Translate from Afrikaans to English
@staticmethod def _standardise_date_of_birth(date_of_birth): """ Standardises the date of birth field value due to a mixture of formats that can be extracted. Due to the preference of extracting the date of birth from the id number as opposed to the ocr output, there tends to be a discrepancy in the date format retrieved, therefore, standardise it for future use.
Authors: Jan-Justin van Tonder
Args: date_of_birth (str): The date of birth to be standardised.
Returns: (str): A standardised date of birth field value if the extracted format could be parsed, else the extracted format is kept. """ # Attempt to parse the different dates that could appear for formatting. # If the current date contains a '-', then it was extracted from the id number and '-' is the # third character in, parse it in the format 'YY-MM-DD' # If the current date contains a '-', then it was extracted from the id number, parse it in the # format 'YYYY-MM-DD' based on elimination of possibilities for this specific ID context. # Otherwise it was extracted from the OCR output, therefore, parse it in the # format 'DD MMM YYYY' else: # Check to see if the year is too far in the future. # Reduce the year by a defined year delta. replacement_year = standardised_date_of_birth.year - SAID.YEAR_DELTA standardised_date_of_birth = standardised_date_of_birth.replace(year=replacement_year) # Standardise the date by formatting it according to ISO date format standard, # which is 'YYYY-MM-DD' # Could not parse the date so log and keep it as is.
def validate_id_number(self, id_number): """ Determines whether a given id number is valid or not.
Args: id_number (str): An ID number that is to be validated.
Returns: (bool): True if the id number is valid, False otherwise.
Raises: TypeError: If id_number is not a string containing only numeric characters. """ 'Bad type for arg id_number - expected string of ONLY numeric characters. Received type "%s"' % type(id_number).__name__ ) # Logging for debugging purposes. # Determine if the id number is of a valid length. # Return early since the result will be false anyways. # Do not calculate the checksum if it is not required. # Determine if the id number checksum is valid. # Both the length and the checksum must be valid for the entire id number to be valid. # Logging for debugging purposes. # Return final result of validation.
@staticmethod def _compute_checksum(id_number): """ Compute the Luhn checksum for the given id number string for validation.
Authors: Jan-Justin van Tonder
Args: id_number (str): A string containing an id number for which the Luhn checksum is to be calculated.
Returns: (int): Luhn checksum value for validation. """ # Create a list of ID number digits parsed as integers. # Create a sum of the even digits by multiplying each digit by 2, performing mod 10 division and summing # the resultant digits. # Sum all the odd positioned digits. # Return the Luhn checksum value for validation. |