Source code for hutts_verification.image_processing.text_cleaner

"""
This file contains the logic used to manage the removal of characters
from an input string.
"""

import re
from hutts_verification.utils.hutts_logger import logger

# Module-level metadata: authorship, copyright, licence and maintenance status.
__author__ = "Jan-Justin van Tonder"
__copyright__ = "Copyright 2017, Java the Hutts"
__license__ = "BSD"
__maintainer__ = "Jan-Justin van Tonder"
__email__ = "J.vanTonder@tuks.co.za"
__status__ = "Development"


class TextCleaner:
    """
    This class encapsulates the logic required to clean the OCR output string
    produced from an image of an ID.

    :_deplorables (list): A list of strings that contain characters that are to be
        filtered out from the OCR output string during string cleaning. This list
        holds the instance-wide defaults and is never modified by clean_up.
    """

    def __init__(self):
        """
        Responsible for initialising the TextCleaner object.
        """
        # Logging for debugging purposes.
        logger.debug('Initialising %s...' % type(self).__name__)
        # Specify initial list of undesirable characters. The underscore has to be
        # listed explicitly because it is a word character (\w) and would otherwise
        # survive the "keep word characters" part of the compiled pattern.
        self._deplorables = ['_']

    def clean_up(self, in_string, deplorables=None):
        """
        This function serves to receive an input string, clean it up through
        removing undesirable characters and unnecessary whitespace, and to return
        the cleaned string.

        :param in_string (str): The input string that is to be cleaned.
        :param deplorables (list): A list of characters that are to be filtered from
            the input string, in addition to the instance defaults. These only apply
            to this call and are not remembered for subsequent calls.

        Returns:
            - (str): A string that has been stripped of undesirable characters and
              unnecessary whitespace.

        Raises:
            - TypeError: If in_string is not a string.
            - TypeError: If deplorables is not a list of strings.
        """
        # Check if the correct argument types have been passed in.
        if not isinstance(in_string, str):
            raise TypeError(
                'Bad type for arg in_string - expected string. Received type "%s".' %
                type(in_string).__name__
            )
        # Falsy values (None, empty list) mean "no extra characters" and are let
        # through; anything else must be a list in which every element is a string.
        if deplorables and (
                not isinstance(deplorables, list) or
                not all(isinstance(item, str) for item in deplorables)
        ):
            raise TypeError(
                'Bad type for arg deplorables - expected list of strings. Received type "%s".' %
                type(deplorables).__name__
            )
        # Remove undesirable characters.
        compiled_deplorable_re = self._compile_deplorables(deplorables)
        sanitised = compiled_deplorable_re.sub('', in_string)
        # Remove empty lines in-between text-filled lines.
        stripped_and_sanitised = re.sub(r'(\n\s*\n)', '\n', sanitised)
        # Remove whitespace directly before and after each line break.
        clean_text = re.sub(r'(\s*\n\s*)', '\n', stripped_and_sanitised)
        # Collapse runs of spaces in-between text into a single space.
        clean_text = re.sub(r'( +)', ' ', clean_text)
        # Lastly, strip the trailing and leading whitespace and return the result.
        return clean_text.strip()

    def _compile_deplorables(self, deplorables):
        """
        This function is responsible for compiling a regex pattern that is used to
        filter out the characters that were deemed undesirable from a string.

        The pattern is built from the instance defaults plus the characters given
        for this call. The instance defaults are left untouched, so characters
        supplied for one call do not affect later calls.

        :param deplorables (list): A list of characters that are to be filtered from
            the input string, or None if there are no additional characters.

        Returns:
            - (obj): A compiled regex pattern used to match undesirable characters
              in a string.
        """
        # Start from the instance defaults; these are inserted into the character
        # class verbatim.
        unwanted = ''.join(self._deplorables)
        # Escape and add the caller-supplied characters to a local copy only, so
        # that regex metacharacters such as ']' or '^' are matched literally.
        if deplorables is not None:
            unwanted += re.escape(''.join(deplorables))
        # Define a class of characters that we wish to keep for the regex
        # that is to be compiled.
        reg_exp = r'[^\w\d\s-]'
        # Only add the explicit character class when there is something to put in
        # it, since an empty '[]' is not a valid regular expression.
        if unwanted:
            reg_exp += r'|[' + unwanted + ']'
        # Return a compiled regular expression pattern to use for matching.
        return re.compile(reg_exp, re.UNICODE)