Coverage for image_processing/text_cleaner.py : 80%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
""" ---------------------------------------------------------------------- Authors: Jan-Justin van Tonder ---------------------------------------------------------------------- This file contains the logic used to manage the removal of characters from an input string. ---------------------------------------------------------------------- """
import re from hutts_utils.hutts_logger import logger
class TextCleaner: """ This class encapsulates the logic required to clean the OCR output string produced from an image of an ID.
Attributes: _deplorables (list): A list of strings that contain characters that are to be filtered out from the OCR output string during string cleaning. """ def __init__(self): """ Responsible for initialising the TextCleaner object. """ # Logging for debugging purposes. # Specify initial list of undesirable characters.
def clean_up(self, in_string, deplorables=None): """ This function receives an input string, cleans it up by removing undesirable characters and unnecessary whitespace, and returns the cleaned string.
Authors: Jan-Justin van Tonder
Args: in_string (str): The input string that is to be cleaned. deplorables (list, Optional): A list of characters that are to be filtered from the input string.
Returns: str: A string that has been stripped of undesirable characters and unnecessary whitespace.
Raises: TypeError: If in_string is not a string. TypeError: If deplorables is not a list of strings. """ # Check if the correct argument types have been passed in. 'Bad type for arg in_string - expected string. Received type "%s".' % type(in_string).__name__ ) 'Bad type for arg deplorables - expected list of strings. Received type "%s".' % type(deplorables).__name__ ) # Remove undesirable characters, spaces and newlines. # Remove empty lines in-between text-filled lines. # Remove multiple spaces before and after text-filled line. # Remove multiple spaces in-between text-filled line. # Lastly, strip the trailing and leading spaces. # Return cleaned text.
def _compile_deplorables(self, deplorables): """ This function is responsible for compiling a regex pattern that is used to filter out the characters that were deemed undesirable from a string.
Authors: Jan-Justin van Tonder
Args: deplorables (list): A list of characters that are to be filtered from the input string.
Returns: A compiled regex pattern used to match undesirable characters in a string. """ # Append to existing list of undesirable characters if there is a given list of # undesirable characters. # Escape and append the list of undesirable characters. # Define a class of characters that we wish to keep for the regex # that is to be compiled. # If the existing list of undesirable characters is not empty, # add the list of undesirable characters to the regex that is to be compiled. # Return a compiled regular expression pattern to use for matching. |