Coverage for verification/text_verify.py : 72%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
""" ---------------------------------------------------------------------- Authors: Jan-Justin van Tonder ---------------------------------------------------------------------- Contains the logic used to verify the extracted text from a form of ID. ---------------------------------------------------------------------- """
import Levenshtein from hutts_utils.hutts_logger import logger, prettify_json_message
class TextVerify: """ This class is responsible for the verification of text that is extracted from an ID.
Authors: Jan-Justin van Tonder """
def __init__(self): """ Initialises the TextVerify object.
Authors: Jan-Justin van Tonder """ # Logging for debugging purposes.
def verify(self, extracted, verifier, threshold=75.00, min_matches=4, verbose=False): """ This function is responsible for the verification of text that is extracted from an ID and is passed in, along with information that is to be used to verify the extracted text.
Args: extracted (dict): A dictionary containing the information that was extracted from an ID. verifier (dict): A dictionary containing the information against which the extracted data is to be verified. threshold (float): A threshold percentage (out of 100) that is used to determine whether or not the final match percentage is accepted as verified. min_matches (int): The minimum number of matches that have to be calculated for the final result to be considered as verified. verbose (bool): Indicates whether or not to return all of the calculated match percentages.
Returns: (bool, float | dict): The first value returned is a bool that indicates whether or not the total percentage match is above the specified threshold value, while the second return value is the total percentage match value if verbose is False, or returns a dict of all the determined percentage match values if verbose is True.
Raises: TypeError: If extracted is not a dictionary. TypeError: If verifier is not a dictionary. TypeError: If threshold is not a float. TypeError: If min_matches is not an integer. TypeError: If verbose is not a boolean. """ 'Bad type for arg extracted - expected dict. Received type "%s"' % type(extracted).__name__ ) 'Bad type for arg verifier - expected dict. Received type "%s"' % type(verifier).__name__ ) 'Bad type for arg threshold - expected float. Received type "%s"' % type(threshold).__name__ ) 'Bad type for arg min_matches - expected int. Received type "%s"' % type(min_matches).__name__ ) 'Bad type for arg verbose - expected bool. Received type "%s"' % type(verbose).__name__ ) # Set minimum number of matches, if zero or less set to one. # Logging for debugging and verbose purposes. # Prettify and log the extracted information. # Prettify and log the verifier information. # Initialise a dictionary to house the final matching percentages. # Iterate over the verifier and calculate a percentage match for the values, # if the keys match and the corresponding values exist. # Compute the match percentage. 'match_percentage': self._match_percentage(value, extracted[key]), 'verifier_field_value': value, 'extracted_field_value': extracted[key] } '"%s" and "%s" match percentage: %.2f' % (value, extracted[key], match_percentages[key]['match_percentage']) ) else: # Determine the number of percentages calculated and initialise a default value for the total match score. # Check if enough matches were found. # Calculate the total match score. # Either the minimum number of percentages criteria was not met. else: # Determine whether or not the text is verified. # Logging for debugging purposes. # Return the final result. # Append the total and non-matches to the existing percentages for verbose purposes, # and return all percentage values.
@staticmethod def _match_percentage(str_x, str_y): """ This function is responsible for determining the percentage match for two strings and returning said percentage.
Authors: Jan-Justin van Tonder
Args: str_x (str): The first string that is used to perform matching. str_y (str): The second string that is used to perform matching.
Returns: (float): Match percentage of the two given strings.
Raises: TypeError: If str_x is not a string. TypeError: If str_y is not a string. """ 'Bad type for arg str_x - expected string. Received type "%s"' % type(str_x).__name__ ) 'Bad type for arg str_y - expected string. Received type "%s"' % type(str_y).__name__ )
@staticmethod def _total_percentage_match(matches): """ This function is responsible for calculating a single, total percentage match value for a dict of match values that have been calculated.
Authors: Jan-Justin van Tonder
Args: matches (dict): A dictionary of pre-calculated, match percentages.
Returns: (float): A total match percentage (out of 100) for a given set of match percentages.
Todo: Investigate the proposal of calculating a weighted total. """
@staticmethod def _get_non_matches(extracted, verifier): """ Creates a dictionary containing fields for which matches could not be computed, due to non-existence of fields or field values.
Author: Jan-Justin van Tonder
Args: extracted (dict): A dictionary containing the information that was extracted from an ID. verifier (dict): A dictionary containing the information against which the extracted data is to be verified.
Returns: (dict): A dictionary containing fields for which no matches can be found. """ # Iterate over the extracted and verifier dictionaries to determine the field values for which match # percentages cannot be computed due to non-existence of values. # There exists no corresponding field or field value for the verifier in the extracted ID info. 'match_percentage': None, 'verifier_field_value': verify_value, 'extracted_field_value': None } # There exists no corresponding field or field value for the extracted ID info in the verifier. 'match_percentage': None, 'verifier_field_value': None, 'extracted_field_value': extract_value }
def validate_id_number(self, id_number, valid_length=13):
    """
    Determines whether a given id number is valid or not.

    An id number is considered valid when it is exactly valid_length characters long and its Luhn checksum,
    as computed by _compute_checksum, is equal to zero.

    Args:
        id_number (str): The id number to validate, given as a string consisting solely of decimal digits.
        valid_length (int): Specifies the length of a given id number to be considered as valid.

    Returns:
        (bool): True if the id number is valid, False otherwise.

    Raises:
        TypeError: If id_number is not a string containing only numeric characters.
        TypeError: If valid_length is not an integer.
    """
    # str.isdecimal() is used instead of str.isnumeric(): isnumeric() also accepts characters such as
    # superscripts and vulgar fractions, which int() cannot convert, so they would slip past this guard
    # and surface later as an undocumented ValueError while the checksum is being computed.
    if not isinstance(id_number, str) or not id_number.isdecimal():
        raise TypeError(
            'Bad type for arg id_number - expected string of ONLY numeric characters. Received type "%s"' %
            type(id_number).__name__
        )
    if not isinstance(valid_length, int):
        raise TypeError(
            'Bad type for arg valid_length - expected integer. Received type "%s"' %
            type(valid_length).__name__
        )
    # Logging for debugging purposes.
    logger.debug('Checking if extracted id number is valid...')
    # Determine if the id number is of a valid length.
    is_valid_length = len(id_number) == valid_length
    logger.debug('Extracted id number length appears %s' % ('valid' if is_valid_length else 'invalid'))
    # Return early since the result will be false anyways.
    # Do not calculate the checksum if it is not required.
    if not is_valid_length:
        logger.debug('Extracted id number appears invalid')
        return False
    # The length is known to be valid at this point, so the checksum alone decides the final result.
    is_valid_id_number = self._compute_checksum(id_number) == 0
    # Logging for debugging purposes.
    logger.debug('Extracted id number checksum appears %s' % ('valid' if is_valid_id_number else 'invalid'))
    logger.debug('Extracted id number appears %s' % ('valid' if is_valid_id_number else 'invalid'))
    # Return final result of validation.
    return is_valid_id_number
@staticmethod def _compute_checksum(id_number): """ Compute the Luhn checksum for the given id number string for validation.
Authors: Jan-Justin van Tonder
Args: id_number (str): A string containing an id number for which the Luhn checksum is to be calculated.
Returns: (int): Luhn checksum value for validation. """ # Map the digits of the given id number to new integers and create a list from said mapping. digits = list(map(int, id_number)) # Create a sum of the even digits by multiplying each digit by 2, performing mod 10 division and summing # the resultant digits. even_partial_sum = [sum(divmod(2 * digit, 10)) for digit in digits[-2::-2]] even_sum = sum(even_partial_sum) # Sum all the odd positioned digits. odd_sum = sum(digits[-1::-2]) # Return the Luhn checksum value for validation. return (even_sum + odd_sum) % 10 |