Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

""" 

---------------------------------------------------------------------- 

Authors: Jan-Justin van Tonder 

---------------------------------------------------------------------- 

This file contains the logic for the South African ID book (pre 1994)

context.

---------------------------------------------------------------------- 

""" 

 

from id_contexts.id_context import FieldType, LineType 

from id_contexts.sa_id import SAID 

from fuzzywuzzy import fuzz 

 

 

class SAIDBookOld(SAID):
    """
    ID context for a South African ID book (pre 1994).

    Supplies the concrete match contexts for this document type to its parent
    class (SAID) and implements the context-specific matching hook
    ``_get_idiosyncratic_match``.
    """

    def __init__(self):
        """
        Initialises the SAIDBookOld object.

        Builds one match-context dictionary per field and passes the list to
        the parent initialiser.
        """
        # Each dictionary describes one field: the output key ('field'), the
        # title text to look for ('find'), and the field/line type flags.
        # NOTE(review): how these keys are interpreted is decided by the parent
        # class, which is not visible here.
        identity_number = {
            'field': 'identity_number',
            'find': 'id no',
            'field_type': FieldType.NUMERIC_ONLY,
            'line_type': LineType.TITLED_ADJACENT,
            'multi_line': False
        }
        surname = {
            'field': 'surname',
            'find': 'vansurname',
            'field_type': FieldType.TEXT_ONLY,
            'to_uppercase': False,
            'line_type': LineType.TITLED_NEWLINE,
            'multi_line': True,
            'multi_line_end': 'voornameforenames'
        }
        names = {
            'field': 'names',
            'find': 'voornameforenames',
            'field_type': FieldType.TEXT_ONLY,
            'to_uppercase': False,
            'line_type': LineType.TITLED_NEWLINE,
            'multi_line': True,
            'multi_line_end': 'geboortedistrik of-land'
        }
        sex = {
            'field': 'sex',
            'find': 'sex',
            'field_type': FieldType.TEXT_ONLY,
            'to_uppercase': False,
            'line_type': LineType.TITLED_NEWLINE,
            'multi_line': False
        }
        date_of_birth = {
            'field': 'date_of_birth',
            'find': 'geboortedatum',
            'field_type': FieldType.DATE_HYPHENATED,
            'to_uppercase': False,
            'line_type': LineType.TITLED_ADJACENT,
            'multi_line': False
        }
        country_of_birth = {
            'field': 'country_of_birth',
            'find': 'district or country of birth',
            'field_type': FieldType.TEXT_ONLY,
            'to_uppercase': True,
            'line_type': LineType.TITLED_NEWLINE,
            'multi_line': False
        }
        status = {
            'field': 'status',
            'find': 'saburgersacitizen',
            'field_type': FieldType.TEXT_ONLY,
            'to_uppercase': False,
            'line_type': LineType.UNTITLED_ADJACENT,
            'multi_line': False
        }
        # The list order is kept exactly as declared: identity number first,
        # status last.
        contexts = [
            identity_number,
            surname,
            names,
            sex,
            date_of_birth,
            country_of_birth,
            status
        ]
        # Initialise parent
        SAID.__init__(self, contexts)

    def _get_idiosyncratic_match(self, match_context, id_string_list, current_index):
        """
        Identifies and returns matches that are specific to the current ID context.

        Authors:
            Jan-Justin van Tonder

        Args:
            match_context (dict): A dictionary object that provides context for the information that is to be
                extracted. Only its 'field' entry is read here.
            id_string_list (list): An ID string that has been broken down into a list of individual lines.
            current_index (int): The current index within the ID string list.

        Returns:
            (str): For 'identity_number' and 'date_of_birth', the line at current_index, unmodified.
                For 'status', either 'citizen' or 'non citizen'.
            (None): Used to indicate that no special case was identified.
        """
        field_name = match_context['field']
        # The ID number and the date of birth are taken from the current line
        # itself. The line is returned as-is; no check of its content is made
        # here.
        if field_name in ('identity_number', 'date_of_birth'):
            return id_string_list[current_index]
        # Any field other than status has no special case.
        if field_name != 'status':
            return None
        # Status: fuzzy-compare the current line against both reference
        # strings and keep the closer one. The comparison is strict, so a tie
        # resolves to 'non citizen'.
        current_line = id_string_list[current_index]
        citizen_score = fuzz.token_set_ratio(current_line, 'saburgersacitizen')
        non_citizen_score = fuzz.token_set_ratio(current_line, 'niesaburgernonsacitizen')
        if citizen_score > non_citizen_score:
            return 'citizen'
        return 'non citizen'