Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

""" 

---------------------------------------------------------------------- 

Authors: Stephan Nell, Marno Hermann, Jan-Justin van Tonder 

---------------------------------------------------------------------- 

This file contains the abstraction and high-level logic of South 

African ID contexts. 

---------------------------------------------------------------------- 

""" 

 

import re 

from abc import abstractmethod 

from fuzzywuzzy import fuzz 

from datetime import datetime 

from id_contexts.id_context import IDContext, LineType 

from hutts_utils.hutts_logger import logger 

 

 

class SAID(IDContext):
    """
    An abstract class for South African IDs.

    Contains the high-level logic that is relevant to all South African IDs.
    """

    # Minimum fuzzy ratio that a post-processed field value (e.g. the country
    # of birth) must reach before it is treated as a match and rewritten.
    POST_PROCESS_MIN_FUZZY_RATIO = 70.0

    # Number of characters a valid ID number must have.
    VALID_ID_LENGTH = 13

    # Minimum age, in years, used to derive the threshold year when
    # standardising a date of birth: a parsed year at or beyond
    # (current year - MIN_AGE_DELTA) is regarded as too recent.
    MIN_AGE_DELTA = 15

    # Number of years subtracted from a date of birth whose parsed year is at
    # or beyond the threshold year.
    YEAR_DELTA = 100

 

def __init__(self, match_contexts):
    """
    Initialises the SAID object.

    Args:
        match_contexts (list): A list of dictionaries that contain the contextual information used in the process
            of retrieving field values from the OCR output string.
    """
    # Log the concrete (sub)class being constructed, for debugging purposes.
    logger.debug('Initialising %s...' % type(self).__name__)
    # Hand the match contexts over to the parent initialiser.
    IDContext.__init__(self, match_contexts)

 

def _dictify(self, match_contexts, id_string, barcode_data, fuzzy_min_ratio, max_multi_line):
    """
    Generates a dictionary object containing the relevant ID information, such as names, surname,
    ID number, etc., from a given input string containing said relevant information.

    Authors:
        Jan-Justin van Tonder

    Args:
        match_contexts (list): A list of dictionaries that contain the contextual information used in the process
            of retrieving field values from the OCR output string.
        id_string (str): A string containing some ID information.
        barcode_data (dict, Optional): A dictionary object containing information extracted from a barcode.
            When truthy, it must provide the key 'identity_number'.
        fuzzy_min_ratio (float): The threshold ratio for a minimum, acceptable ratio of fuzziness when comparing
            two strings.
        max_multi_line (int): Specifies the maximum number of lines that is to be extracted from fields that are
            noted as running onto multiple lines.

    Returns:
        (dict): A dictionary object containing the relevant, extracted ID information.
    """
    id_info = {}
    # Barcode data is consulted first: the ID number it carries, and the fields
    # derived from that number, are considered more reliable than OCR text, so
    # they are stored before the text is searched (the text search only fills
    # fields that are still missing or empty).
    if barcode_data:
        logger.debug('Extracting details from barcode data...')
        barcode_id_number = barcode_data['identity_number']
        id_info['identity_number'] = barcode_id_number
        self._id_number_information_extraction(match_contexts, id_info, barcode_id_number)
    # Fill in whatever is still missing from the given ID string.
    logger.debug('Extracting details from the given text string...')
    self._populate_id_information(match_contexts, id_string, id_info, fuzzy_min_ratio, max_multi_line)
    # Perform custom post-processing on the information that was extracted.
    logger.debug('Post-processing some field values...')
    self._post_process(id_info)
    return id_info

 

def _populate_id_information(self, match_contexts, id_string, id_info, fuzzy_min_ratio, max_multi_line):
    """
    Populates a dictionary object with information that can be found in, and extracted from, a given
    string containing ID information. The dictionary is modified in place.

    Authors:
        Jan-Justin van Tonder

    Args:
        match_contexts (list): A list of dictionaries that contain the contextual information used in the process
            of retrieving field values from the OCR output string. Each one must provide the key 'field'.
        id_string (str): A string containing some ID information.
        id_info (dict): A dictionary object used to house extracted ID information.
        fuzzy_min_ratio (float): The threshold ratio for a minimum, acceptable ratio of fuzziness when comparing
            two strings.
        max_multi_line (int): Specifies the maximum number of lines that is to be extracted from fields that are
            noted as running onto multiple lines.
    """
    # Work on the individual lines of the ID string.
    id_string_list = id_string.split('\n')
    for match_context in match_contexts:
        # The desired field name doubles as the key in id_info.
        key = match_context['field']
        # Leave fields alone that already hold a (truthy) value, e.g. values
        # that were derived from barcode data or from the ID number.
        if id_info.get(key):
            continue
        id_info[key] = self._get_match(id_string_list, match_context, fuzzy_min_ratio, max_multi_line)
        # Once the ID number has been retrieved, use it to derive the other
        # fields that are embedded within it; those are regarded as more
        # reliable than the OCR text, at least theoretically.
        if key == 'identity_number' and id_info[key]:
            self._id_number_information_extraction(match_contexts, id_info, id_info[key])

 

def _get_match(self, id_string_list, match_context, fuzzy_min_ratio, max_multi_line):
    """
    Searches through a list of lines from an ID string and extracts the relevant ID information based on
    the context that is provided as input. Fuzzy string matching is performed on field names in order to
    extract field values.

    Authors:
        Jan-Justin van Tonder

    Args:
        id_string_list (list): An ID string that has been broken down into a list of individual lines.
        match_context (dict): A dictionary object that provides context for the information that is to be
            extracted. The keys 'find' and 'line_type' are always read; 'multi_line' is read for values found
            on the line after the field name, and 'multi_line_end' when 'multi_line' is truthy.
        fuzzy_min_ratio (float): The threshold ratio for a minimum, acceptable ratio of fuzziness when comparing
            two strings.
        max_multi_line (int): Specifies the maximum number of lines that is to be extracted from fields that are
            noted as running onto multiple lines.
            e.g. Given OCR output such as :
                ...
                Names\\n
                This is a long\\n
                long list of names\\n
                that spans multiple\\n
                lines\\n
                ...
                max_multi_line = 2, means that only the string:
                "This is a long list of names" is retrieved.

    Returns:
        (str): A string containing the extracted information, if a match was found.
        (None): If nothing was matched or an extracted value is an empty string.
    """
    best_match_ratio = fuzzy_min_ratio
    match = None
    skip_to_index = -1
    id_num_lines = len(id_string_list)
    line_type = match_context['line_type']
    # Pick the fuzzy matching function that suits the line type: when the value
    # shares a line with the field name, only part of the line can match it.
    get_match_ratio = fuzz.token_set_ratio
    if line_type == LineType.TITLED_ADJACENT:
        get_match_ratio = fuzz.partial_token_set_ratio
    for current_index, current_line in enumerate(id_string_list):
        # Lines that were already consumed as part of a multi-line value are skipped.
        if skip_to_index > current_index:
            continue
        match_ratio = get_match_ratio(current_line, match_context['find'])
        if match_ratio < best_match_ratio:
            continue
        # This line matches the field name at least as well as any line before it.
        best_match_ratio = match_ratio
        # Give the subclass the first opportunity to extract the value.
        idiosyncratic_match = self._get_idiosyncratic_match(match_context, id_string_list, current_index)
        if idiosyncratic_match is not None:
            match = idiosyncratic_match
        elif line_type == LineType.TITLED_ADJACENT:
            # The value sits on the same line as the field name: strip the name.
            # NOTE(review): 'find' is passed to re.sub as a regular expression, so
            # any regex metacharacters in it are interpreted - confirm this is intended.
            match = re.sub(match_context['find'], '', current_line).strip()
        elif line_type == LineType.TITLED_NEWLINE and current_index + 1 < id_num_lines:
            # Only the field value is of interest, not the field name.
            # e.g: Surname\n
            #      Smith\n
            #      ...
            # 'Surname' is ignored and the value on the very next line is taken.
            match = id_string_list[current_index + 1]
            if match_context['multi_line']:
                # Collect the lines that follow the first value line, without
                # running past the end of the list. The range is simply empty
                # when the value sits on the last line, so that value still
                # reaches the normalisation step below (an early 'continue'
                # here used to let it bypass normalisation).
                lower_index = current_index + 2
                upper_index = min(current_index + max_multi_line + 1, id_num_lines)
                for forward_index in range(lower_index, upper_index):
                    # Stop as soon as a line resembles the configured end point.
                    end_point_ratio = fuzz.token_set_ratio(
                        match_context['multi_line_end'],
                        id_string_list[forward_index]
                    )
                    if end_point_ratio >= fuzzy_min_ratio:
                        skip_to_index = forward_index
                        break
                    # Otherwise, the line belongs to the field value.
                    match += ' %s' % id_string_list[forward_index].strip()
        # Only normalise when there is something to normalise.
        if not match:
            continue
        match = self._normalise_match(match_context, match)
    # An empty value ('', not None) is reported as None as well.
    return match or None

 

@abstractmethod
def _get_idiosyncratic_match(self, match_context, id_string_list, current_index):
    """
    Abstract method to be implemented by subclasses.
    Meant to retrieve matches that are particular to a context of a subclass.

    Args:
        match_context (dict): A dictionary object that provides context for the information that is to be extracted.
        id_string_list (list): An ID string that has been broken down into a list of individual lines.
        current_index (int): The current index within the ID string list.

    Returns:
        (None): This base implementation extracts nothing. A return value of None tells _get_match that there
            is no special case, so that it falls back on the extraction for the line type of the context.
    """
    return None

 

@staticmethod
def _id_number_information_extraction(match_contexts, id_info, id_number):
    """
    Extracts information from a given ID number and populates a given dictionary object with the
    extracted information. The dictionary is modified in place.

    A field is only written when the digits it is derived from are actually present in the ID number.
    A truncated or garbled ID number therefore leaves the corresponding fields untouched, rather than
    overwriting them with values derived from missing characters.

    Authors:
        Marno Hermann
        Stephan Nell
        Jan-Justin van Tonder

    Args:
        match_contexts (list): A list of dictionaries that contain the contextual information used in the process
            of retrieving field values from the OCR output string. Only fields named by the 'field' key of one
            of these contexts are populated.
        id_info (dict): A dictionary object containing extracted ID information.
        id_number (str): An ID number.
    """
    for match_context in match_contexts:
        field = match_context['field']
        if field == 'date_of_birth':
            # The first six characters hold the date of birth as YYMMDD.
            birth_digits = id_number[:6]
            if len(birth_digits) == 6 and birth_digits.isdigit():
                id_info['date_of_birth'] = '%s-%s-%s' % (birth_digits[:2], birth_digits[2:4], birth_digits[4:6])
        elif field == 'sex':
            # The seventh character determines the gender: below 5 is female,
            # 5 and above is male.
            gender_digit = id_number[6:7]
            if gender_digit.isdigit():
                id_info['sex'] = 'F' if gender_digit < '5' else 'M'
        elif field == 'status':
            # The eleventh character determines the status: 0 is a citizen.
            status_digit = id_number[10:11]
            if status_digit.isdigit():
                id_info['status'] = 'Citizen' if status_digit == '0' else 'Non Citizen'

 

def _post_process(self, id_info):
    """
    Used to perform custom processing after extraction has taken place.
    All custom operations that are required after all the extraction has taken place, should be
    called from within this function.

    Authors:
        Jan-Justin van Tonder

    Args:
        id_info (dict): A dictionary object used to house extracted ID information. It is modified in place.

    Returns:
        (dict): The original id_info, with some post-processed field values.
    """
    # Standardise the date of birth, if one was extracted.
    date_of_birth = id_info.get('date_of_birth')
    if date_of_birth:
        id_info['date_of_birth'] = self._standardise_date_of_birth(date_of_birth)
    # Translate the country of birth, if one was extracted.
    country_of_birth = id_info.get('country_of_birth')
    if country_of_birth:
        fuzz_ratio = fuzz.token_set_ratio(country_of_birth, 'SUID-AFRIKA')
        # We should be fairly certain, with a margin for error, that we have a match.
        if fuzz_ratio >= SAID.POST_PROCESS_MIN_FUZZY_RATIO:
            # Translate from Afrikaans to English.
            id_info['country_of_birth'] = 'South Africa'
    # Hand the very same dictionary back, as promised by the documentation.
    return id_info

 

@staticmethod
def _standardise_date_of_birth(date_of_birth):
    """
    Standardises the date of birth field value due to a mixture of formats that can be extracted.
    Due to the preference of extracting the date of birth from the id number as opposed to
    the ocr output, there tends to be a discrepancy in the date format retrieved, therefore,
    standardise it for future use.

    Authors:
        Jan-Justin van Tonder

    Args:
        date_of_birth (str): The date of birth to be standardised.

    Returns:
        (str): A standardised date of birth field value, in the format 'YYYY-MM-DD', if the extracted format
            could be parsed, else the extracted format is kept.
    """
    try:
        # Spaces carry no information in any of the supported formats.
        current_date_of_birth = re.sub(' ', '', date_of_birth)
        first_dash_index = current_date_of_birth.find('-')
        if first_dash_index == 2:
            # A '-' as the third character means the date was derived from the
            # id number: 'YY-MM-DD'.
            standardised_date_of_birth = datetime.strptime(current_date_of_birth, '%y-%m-%d')
        elif first_dash_index != -1:
            # Any other date containing a '-' is taken to be 'YYYY-MM-DD', by
            # elimination of possibilities for this specific ID context.
            standardised_date_of_birth = datetime.strptime(current_date_of_birth, '%Y-%m-%d')
        else:
            # Otherwise it was extracted from the OCR output: 'DD MMM YYYY'
            # (with the spaces removed above).
            standardised_date_of_birth = datetime.strptime(current_date_of_birth, '%d%b%Y')
        # A year at or beyond the threshold is too recent to be plausible, which
        # happens when a two-digit year is expanded into the wrong century.
        threshold_year = datetime.now().year - SAID.MIN_AGE_DELTA
        if standardised_date_of_birth.year >= threshold_year:
            # Reduce the year by the defined year delta.
            replacement_year = standardised_date_of_birth.year - SAID.YEAR_DELTA
            standardised_date_of_birth = standardised_date_of_birth.replace(year=replacement_year)
        # Format the date according to the ISO date format standard, 'YYYY-MM-DD'.
        return datetime.strftime(standardised_date_of_birth, '%Y-%m-%d')
    except ValueError:
        # Could not parse the date so log and keep it as is.
        logger.warning('Could not parse date "%s" for formatting. Keeping date as is.' % date_of_birth)
        return date_of_birth

 

def validate_id_number(self, id_number):
    """
    Determines whether a given id number is valid or not.

    An id number is valid when it has the expected length and its Luhn checksum is zero.

    Args:
        id_number (str): An ID number that is to be validated.

    Returns:
        (bool): True if the id number is valid, False otherwise.

    Raises:
        TypeError: If id_number is not a string containing only numeric characters.
    """
    # Only decimal digits are accepted. str.isnumeric() would also let through
    # characters such as superscripts and fractions, which cannot be converted
    # with int() and would make the checksum computation fail with a ValueError.
    if not isinstance(id_number, str) or not id_number.isdecimal():
        raise TypeError(
            'Bad type for arg id_number - expected string of ONLY numeric characters. Received type "%s"' %
            type(id_number).__name__
        )
    logger.debug('Checking if ID number is valid...')
    # Determine if the id number is of a valid length.
    is_valid_length = len(id_number) == SAID.VALID_ID_LENGTH
    logger.debug('ID number length appears %s' % ('valid' if is_valid_length else 'invalid'))
    # Return early since the result will be false anyway; the checksum is not
    # calculated when it is not required.
    if not is_valid_length:
        logger.debug('ID number appears invalid')
        return False
    # The length is known to be valid at this point, so the checksum alone
    # decides the final result.
    is_valid_id_checksum = self._compute_checksum(id_number) == 0
    logger.debug('ID number checksum appears %s' % ('valid' if is_valid_id_checksum else 'invalid'))
    logger.debug('ID number appears %s' % ('valid' if is_valid_id_checksum else 'invalid'))
    return is_valid_id_checksum

 

@staticmethod
def _compute_checksum(id_number):
    """
    Compute the Luhn checksum for the given id number string for validation.

    Authors:
        Jan-Justin van Tonder

    Args:
        id_number (str): A string containing an id number for which the Luhn checksum is to be calculated.
            Every character must be convertible with int().

    Returns:
        (int): Luhn checksum value for validation, in the range 0 to 9. A value of 0 indicates that the
            number passes the Luhn check.
    """
    digits = [int(character) for character in id_number]
    # Starting at the second digit from the right, every second digit is
    # doubled and the digits of the doubled value are added together
    # (e.g. 8 -> 16 -> 1 + 6 = 7).
    doubled_sum = sum(sum(divmod(2 * digit, 10)) for digit in digits[-2::-2])
    # Starting at the rightmost digit, every second digit is added as is.
    plain_sum = sum(digits[-1::-2])
    return (doubled_sum + plain_sum) % 10