Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

""" 

---------------------------------------------------------------------- 

Authors: Jan-Justin van Tonder 

---------------------------------------------------------------------- 

Contains the logic used to verify the extracted text from a form 

of ID. 

---------------------------------------------------------------------- 

""" 

 

import Levenshtein 

from hutts_utils.hutts_logger import logger, prettify_json_message 

 

 

class TextVerify:
    """
    This class is responsible for the verification of text that is extracted from an ID.

    Extracted fields are compared against a set of verifier fields, a match percentage is
    computed per shared field, and the mean of those percentages is compared to a threshold.
    The class also offers a length + Luhn checksum validation for id numbers.

    Authors:
        Jan-Justin van Tonder
    """

    def __init__(self):
        """
        Initialises the TextVerify object.

        Authors:
            Jan-Justin van Tonder
        """
        # Logging for debugging purposes.
        logger.debug('Initialising %s...' % type(self).__name__)

    def verify(self, extracted, verifier, threshold=75.00, min_matches=4, verbose=False):
        """
        This function is responsible for the verification of text that is extracted from an ID and is passed in,
        along with information that is to be used to verify the extracted text.

        Args:
            extracted (dict): A dictionary containing the information that was extracted from an ID.
            verifier (dict): A dictionary containing the information against which the extracted data is to be
                verified.
            threshold (float): A threshold percentage (out of 100) that is used to determine whether or not the
                final match percentage is accepted as verified.
            min_matches (int): The minimum number of matches that have to be calculated for the final result to be
                considered as verified. Values of zero or less are treated as one.
            verbose (bool): Indicates whether or not to return all of the calculated match percentages.

        Returns:
            (bool, float | dict): The first value returned is a bool that indicates whether or not the total
                percentage match is above the specified threshold value, while the second return value is the total
                percentage match value if verbose is False, or returns a dict of all the determined percentage match
                values (plus the non-matching fields and a 'total' entry) if verbose is True.

        Raises:
            TypeError: If extracted is not a dictionary.
            TypeError: If verifier is not a dictionary.
            TypeError: If threshold is not a float.
            TypeError: If min_matches is not an integer.
            TypeError: If verbose is not a boolean.
            TypeError: If a pair of values that is to be compared contains a non-string value.
        """
        if not isinstance(extracted, dict):
            raise TypeError(
                'Bad type for arg extracted - expected dict. Received type "%s"' %
                type(extracted).__name__
            )
        if not isinstance(verifier, dict):
            raise TypeError(
                'Bad type for arg verifier - expected dict. Received type "%s"' %
                type(verifier).__name__
            )
        if not isinstance(threshold, float):
            raise TypeError(
                'Bad type for arg threshold - expected float. Received type "%s"' %
                type(threshold).__name__
            )
        if not isinstance(min_matches, int):
            raise TypeError(
                'Bad type for arg min_matches - expected int. Received type "%s"' %
                type(min_matches).__name__
            )
        if not isinstance(verbose, bool):
            raise TypeError(
                'Bad type for arg verbose - expected bool. Received type "%s"' %
                type(verbose).__name__
            )
        # At least one match is always required, otherwise an empty comparison could be averaged.
        min_matches = max(min_matches, 1)
        # Logging for debugging and verbose purposes.
        logger.debug('Threshold for verification set as: %.2f' % threshold)
        logger.debug('Minimum number of matches for verification set as: %d' % min_matches)
        logger.debug('Simplified percentages to be returned' if not verbose else 'Verbose percentages to be returned')
        logger.debug('-' * 50)
        logger.debug('Verifying:')
        logger.debug('-' * 50)
        # Prettify and log the extracted information.
        self._log_json(extracted)
        logger.debug('-' * 50)
        logger.debug('Against:')
        logger.debug('-' * 50)
        # Prettify and log the verifier information.
        self._log_json(verifier)
        logger.debug('-' * 50)
        # Initialise a dictionary to house the final matching percentages.
        match_percentages = {}
        # Iterate over the verifier and calculate a percentage match for the values,
        # if the keys match and the corresponding values exist.
        for key, value in verifier.items():
            extracted_value = extracted.get(key)
            if extracted_value is None:
                logger.warning('Could not find corresponding field "%s" in extracted information to verify' % key)
                continue
            # Compute the match percentage.
            logger.debug('Computing match "%s" and "%s"...' % (value, extracted_value))
            percentage = self._match_percentage(value, extracted_value)
            match_percentages[key] = {
                'match_percentage': percentage,
                'verifier_field_value': value,
                'extracted_field_value': extracted_value
            }
            logger.debug(
                '"%s" and "%s" match percentage: %.2f' %
                (value, extracted_value, percentage)
            )
        # Determine the number of percentages calculated and initialise a default value for the total match score.
        num_scores = len(match_percentages)
        total_match_percentage = 0.0
        # Only compute a total if enough matches were found; otherwise the total stays at zero.
        if num_scores >= min_matches:
            total_match_percentage = self._total_percentage_match(match_percentages)
        else:
            logger.warning('A total of %d matches were found, which is less than the minimum' % num_scores)
        # Determine whether or not the text is verified.
        is_verified = total_match_percentage >= threshold
        # Logging for debugging purposes.
        logger.debug('-' * 50)
        logger.debug('Intermediate match percentages:')
        logger.debug('-' * 50)
        self._log_json(match_percentages)
        logger.debug('-' * 50)
        logger.debug('Final match percentage: %.2f' % total_match_percentage)
        logger.debug('Threshold to pass: %.2f' % threshold)
        # The conditional is parenthesised so that the 'Result: ' prefix is logged for both outcomes.
        logger.debug('Result: ' + ('Passed' if is_verified else 'Failed'))
        # Return the final result.
        if not verbose:
            return is_verified, total_match_percentage
        # Append the total and non-matches to the existing percentages for verbose purposes,
        # and return all percentage values.
        match_percentages.update(self._get_non_matches(extracted, verifier))
        match_percentages['total'] = total_match_percentage
        return is_verified, match_percentages

    @staticmethod
    def _log_json(message):
        """
        Logs a prettified version of the given object at debug level, one log record per line.

        Args:
            message (dict): The object that is handed to prettify_json_message before being logged.
        """
        for log_line in prettify_json_message(message).split('\n'):
            logger.debug(log_line)

    @staticmethod
    def _match_percentage(str_x, str_y):
        """
        This function is responsible for determining the percentage match for two strings and returning
        said percentage.

        Authors:
            Jan-Justin van Tonder

        Args:
            str_x (str): The first string that is used to perform matching.
            str_y (str): The second string that is used to perform matching.

        Returns:
            (float): Match percentage of the two given strings, rounded to two decimal places.

        Raises:
            TypeError: If str_x is not a string.
            TypeError: If str_y is not a string.
        """
        if not isinstance(str_x, str):
            raise TypeError(
                'Bad type for arg str_x - expected string. Received type "%s"' %
                type(str_x).__name__
            )
        if not isinstance(str_y, str):
            raise TypeError(
                'Bad type for arg str_y - expected string. Received type "%s"' %
                type(str_y).__name__
            )
        # Scale the ratio reported by the Levenshtein package to a percentage.
        return round(Levenshtein.ratio(str_x, str_y) * 100, 2)

    @staticmethod
    def _total_percentage_match(matches):
        """
        This function is responsible for calculating a single, total percentage match value for a dict of match
        values that have been calculated. The total is the arithmetic mean of the 'match_percentage' entries.

        Authors:
            Jan-Justin van Tonder

        Args:
            matches (dict): A dictionary of pre-calculated, match percentages. Each value is expected to be a
                dict that contains a 'match_percentage' entry.

        Returns:
            (float): A total match percentage (out of 100) for a given set of match percentages, rounded to two
                decimal places, or 0.0 if no match percentages were given.

        Todo:
            Investigate the proposal of calculating a weighted total.
        """
        # Nothing to average; avoid dividing by zero.
        if not matches:
            return 0.0
        return round(sum(value['match_percentage'] for value in matches.values()) / len(matches), 2)

    @staticmethod
    def _get_non_matches(extracted, verifier):
        """
        Creates a dictionary containing fields for which matches could not be computed, due to non-existence
        of fields or field values.

        Author:
            Jan-Justin van Tonder

        Args:
            extracted (dict): A dictionary containing the information that was extracted from an ID.
            verifier (dict): A dictionary containing the information against which the extracted data is to be
                verified.

        Returns:
            (dict): A dictionary containing fields for which no matches can be found. Each entry has a
                'match_percentage' of None along with whichever of the two field values is available.
        """
        non_matches = {}
        # The two dictionaries are walked independently: they may differ in size, and pairing their items
        # would stop at the shorter one and silently skip the remaining fields.
        for verify_key, verify_value in verifier.items():
            # There exists no corresponding field or field value for the verifier in the extracted ID info.
            if extracted.get(verify_key) is None:
                non_matches[verify_key] = {
                    'match_percentage': None,
                    'verifier_field_value': verify_value,
                    'extracted_field_value': None
                }
        for extract_key, extract_value in extracted.items():
            # There exists no corresponding field or field value for the extracted ID info in the verifier.
            if verifier.get(extract_key) is None:
                non_matches[extract_key] = {
                    'match_percentage': None,
                    'verifier_field_value': None,
                    'extracted_field_value': extract_value
                }
        return non_matches

    def validate_id_number(self, id_number, valid_length=13):
        """
        Determines whether a given id number is valid or not. An id number is considered valid if it has the
        required length and its Luhn checksum is zero.

        Args:
            id_number (str): The id number to validate, given as a string of decimal digits.
            valid_length (int): Specifies the length of a given id number to be considered as valid.

        Returns:
            (bool): True if the id number is valid, False otherwise.

        Raises:
            TypeError: If id_number is not a string containing only numeric characters.
            TypeError: If valid_length is not an integer.
        """
        # isdecimal() is used rather than isnumeric(): numeric characters that are not decimal digits
        # (superscripts, fractions, ...) cannot be converted by int() when the checksum is computed.
        if not isinstance(id_number, str) or not id_number.isdecimal():
            raise TypeError(
                'Bad type for arg id_number - expected string of ONLY numeric characters. Received type "%s"' %
                type(id_number).__name__
            )
        if not isinstance(valid_length, int):
            raise TypeError(
                'Bad type for arg valid_length - expected integer. Received type "%s"' %
                type(valid_length).__name__
            )
        # Logging for debugging purposes.
        logger.debug('Checking if extracted id number is valid...')
        # Determine if the id number is of a valid length.
        is_valid_length = len(id_number) == valid_length
        logger.debug('Extracted id number length appears %s' % ('valid' if is_valid_length else 'invalid'))
        # Return early since the result will be false anyways.
        # Do not calculate the checksum if it is not required.
        if not is_valid_length:
            logger.debug('Extracted id number appears invalid')
            return False
        # The length is known to be valid at this point, so the checksum alone decides the final result.
        is_valid_id_checksum = self._compute_checksum(id_number) == 0
        # Logging for debugging purposes.
        logger.debug('Extracted id number checksum appears %s' % ('valid' if is_valid_id_checksum else 'invalid'))
        logger.debug('Extracted id number appears %s' % ('valid' if is_valid_id_checksum else 'invalid'))
        # Return final result of validation.
        return is_valid_id_checksum

    @staticmethod
    def _compute_checksum(id_number):
        """
        Compute the Luhn checksum for the given id number string for validation.

        Authors:
            Jan-Justin van Tonder

        Args:
            id_number (str): A string containing an id number for which the Luhn checksum is to be calculated.

        Returns:
            (int): Luhn checksum value for validation, i.e. the Luhn sum modulo 10.
        """
        # Map the digits of the given id number to new integers and create a list from said mapping.
        digits = list(map(int, id_number))
        # Every second digit, starting from the second-to-last one, is doubled; divmod splits the product
        # into its tens and units so that the digits of the product are summed.
        even_sum = sum(sum(divmod(2 * digit, 10)) for digit in digits[-2::-2])
        # The remaining digits, starting from the last one, are summed as they are.
        odd_sum = sum(digits[-1::-2])
        # Return the Luhn checksum value for validation.
        return (even_sum + odd_sum) % 10