Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

""" 

---------------------------------------------------------------------- 

Authors: Jan-Justin van Tonder 

---------------------------------------------------------------------- 

This file contains the abstraction of all ID contexts, which contain 

the necessary information and settings specific to a particular ID 

document type. 

---------------------------------------------------------------------- 

""" 

 

import re 

from abc import ABC, abstractmethod 

from enum import Enum 

from hutts_utils.hutts_logger import logger 

 

 

class IDContext(ABC):
    """
    This class is an abstraction of all ID contexts.

    The ID contexts serve to contain the necessary information and settings specific to a particular ID
    document type for operations such as extracting information from an ID OCR string.

    Attributes:
        _match_contexts (list): A list of dictionaries that contain the contextual information used in the
            process of retrieving field values from the OCR output string.
            e.g. {
                'field': 'surname',                // The field name - can be set to any string one desires.
                'find': 'surname',                 // A string to be used for matching field names
                                                   // in the OCR output string (used to know what to look for).
                'field_type': FieldType.TEXT_ONLY, // Indicates if the field value is to be treated as
                                                   // alphanumeric, just numeric or just alphabetical
                                                   // (e.g. TEXT_ONLY means all digits are removed from the
                                                   // field value).
                'line_type': TITLED_NEWLINE,       // Indicates the type of line to be considered when looking
                                                   // for the field value relative to the 'find' value
                                                   // (e.g. TITLED_NEWLINE indicates that the field value is
                                                   // preceded by a field name/title and a newline).
                'multi_line': True,                // Indicates that the field value spans multiple lines.
                'multi_line_end': 'names',         // (Optional, unless multi_line is true) A string
                                                   // identifying the next field name that indicates the end
                                                   // of the multi-line field value.
                'to_uppercase': False,             // (Optional) True converts the retrieved value to
                                                   // uppercase, False converts it to title case; when the
                                                   // key is absent the case is left untouched.
            }
    """

    def __init__(self, match_contexts):
        """
        Responsible for initialising the IDContext object.

        Args:
            match_contexts (list): A list of dictionaries that contain the contextual information used in the
                process of retrieving field values from the OCR output string.
        """
        # Logging for debugging purposes.
        logger.debug('Initialising %s...' % type(self).__name__)
        # The list is stored by reference (not copied); get_id_info never mutates it.
        self._match_contexts = match_contexts

    @staticmethod
    def _bad_arg_type(arg_name, expected, value):
        """
        Builds the TypeError raised by get_id_info when an argument has an unexpected type.

        Args:
            arg_name (str): The name of the offending argument.
            expected (str): A human readable description of the expected type.
            value: The offending value; only the name of its type is reported.

        Returns:
            (TypeError): An exception instance, ready to be raised by the caller.
        """
        return TypeError(
            'Bad type for arg %s - expected %s. Received type "%s".' %
            (arg_name, expected, type(value).__name__)
        )

    def get_id_info(self, id_string, barcode_data=None, ignore_fields=None, fuzzy_min_ratio=60.0, max_multi_line=2):
        """
        Responsible for filtering undesirable fields to be retrieved as well as delegating the responsibility
        of extracting ID information from an OCR string and housing said information in a convenient dictionary.
        Some type checking is done to reduce the likelihood of errors further down the call stack.

        Authors:
            Jan-Justin van Tonder

        Args:
            id_string (str): A string containing some ID information.
            barcode_data (dict, Optional): A dictionary object containing information extracted from a barcode.
            ignore_fields (list, Optional): A list containing fields which are to be ignored during extraction.
            fuzzy_min_ratio (float): The threshold ratio for a minimum, acceptable ratio of fuzziness when comparing
                two strings.
            max_multi_line (int): Specifies the maximum number of lines that is to be extracted from fields that are
                noted as running onto multiple lines.

        Returns:
            (dict): A dictionary object containing the relevant, extracted ID information (whatever the
                subclass implementation of _dictify returns).

        Raises:
            TypeError: If id_string is not a string.
            TypeError: If barcode_data is truthy and not a dictionary.
            TypeError: If ignore_fields is truthy and not a list made up entirely of strings.
            TypeError: If fuzzy_min_ratio is not a float.
            TypeError: If max_multi_line is not an int.
        """
        # Check if arguments passed in are the correct type.
        if not isinstance(id_string, str):
            raise self._bad_arg_type('id_string', 'string', id_string)
        # Falsy values (None, empty containers) are deliberately let through as "no barcode data".
        if barcode_data and not isinstance(barcode_data, dict):
            raise self._bad_arg_type('barcode_data', 'dictionary', barcode_data)
        # Every element is inspected, not just the first one, so mixed lists are rejected as well.
        if ignore_fields and not (
            isinstance(ignore_fields, list) and all(isinstance(field, str) for field in ignore_fields)
        ):
            raise self._bad_arg_type('ignore_fields', 'list of strings', ignore_fields)
        if not isinstance(fuzzy_min_ratio, float):
            raise self._bad_arg_type('fuzzy_min_ratio', 'float', fuzzy_min_ratio)
        if not isinstance(max_multi_line, int):
            raise self._bad_arg_type('max_multi_line', 'int', max_multi_line)
        if ignore_fields is None:
            # Nothing to filter: hand a shallow copy to _dictify so the stored list itself is never exposed.
            match_contexts = self._match_contexts[:]
        else:
            # Filter out the fields that are to be ignored.
            match_contexts = self._filter_ignore_match_contexts(ignore_fields)
        # Extract ID information and house it in a dictionary, which is returned.
        return self._dictify(match_contexts, id_string, barcode_data, fuzzy_min_ratio, max_multi_line)

    def _filter_ignore_match_contexts(self, ignore_fields):
        """
        Filters out fields which are to be ignored from the match_contexts.

        Authors:
            Jan-Justin van Tonder

        Args:
            ignore_fields (list): A list containing fields which are to be ignored during extraction.

        Returns:
            (list): A new list holding, in their original order, the match contexts whose 'field' value
                does not appear in ignore_fields.
        """
        return [
            match_context
            for match_context in self._match_contexts
            if match_context['field'] not in ignore_fields
        ]

    @abstractmethod
    def _dictify(self, match_contexts, id_string, barcode_data, fuzzy_min_ratio, max_multi_line):
        """
        Abstract method for subclasses to implement.
        Meant to extract ID information from a string and, possibly, barcode data, which is to be returned
        in a convenient dictionary format.

        Args:
            match_contexts (list): A list of dictionaries that contain the contextual information used in the process
                of retrieving field values from the OCR output string.
            id_string (str): A string containing some ID information.
            barcode_data (dict, Optional): A dictionary object containing information extracted from a barcode.
            fuzzy_min_ratio (float): The threshold ratio for a minimum, acceptable ratio of fuzziness when comparing
                two strings.
            max_multi_line (int): Specifies the maximum number of lines that is to be extracted from fields that are
                noted as running onto multiple lines.
        """
        pass

    @staticmethod
    def _normalise_match(match_context, match):
        """
        Normalises a given match string according to the context it was matched.

        Authors:
            Jan-Justin van Tonder

        Args:
            match_context (dict): A dictionary object that provides context for the information that is to be
                extracted. It must contain 'field_type' and may contain 'to_uppercase'.
            match (str): A string containing matched ID information.

        Returns:
            (str): A match string normalised according to its matched context.

        Raises:
            KeyError: If match_context has no 'field_type' entry.
        """
        field_type = match_context['field_type']
        if field_type == FieldType.TEXT_ONLY:
            # If the field value should only be text, strip everything that is numeric.
            match = re.sub(r'[\d]', '', match)
        elif field_type == FieldType.NUMERIC_ONLY:
            # If the field value ought to be numeric only, strip everything that is not numeric.
            match = re.sub(r'[^\d]', '', match)
        elif field_type == FieldType.DATE_HYPHENATED:
            # Hyphenated dates keep their digits and hyphens only.
            match = re.sub(r'[^\d-]', '', match)
        # Any other field type is passed through without character filtering.
        # The case is only touched when the context explicitly carries a 'to_uppercase' entry.
        if 'to_uppercase' in match_context:
            if match_context['to_uppercase']:
                match = match.upper()
            else:
                # Convert to lowercase and capitalise the first character of each word.
                match = match.lower().title()
        return match

 

 

class FieldType(Enum):
    """
    Enumerates the kinds of field value an ID context can declare for extracted ID information.

    The member chosen for a match context's 'field_type' entry decides which characters
    IDContext._normalise_match keeps in the matched string.
    """
    # All digits are stripped from the value.
    TEXT_ONLY = 0
    # Everything that is not a digit is stripped from the value.
    NUMERIC_ONLY = 1
    # No character filtering is applied to the value.
    MIXED = 2
    # Only digits and hyphens are kept in the value.
    DATE_HYPHENATED = 3

 

 

class LineType(Enum):
    """
    Enumerates the kinds of line layout an ID context can declare for extracted ID information.

    The member is stored under a match context's 'line_type' entry and describes where the field
    value is expected relative to the string being searched for.
    """
    # The field value is preceded by a field name/title and a newline.
    TITLED_NEWLINE = 0
    # Presumably the field value sits on the same line as its field name/title - TODO confirm.
    TITLED_ADJACENT = 1
    # Presumably the value has no title and is located relative to a preceding line - TODO confirm.
    UNTITLED_NEWLINE = 2
    # Presumably the value has no title and is located relative to adjacent text - TODO confirm.
    UNTITLED_ADJACENT = 3