Mercurial > public > finance-parser

--- a/main.py	Tue May 30 20:08:35 2023 +0100
+++ b/main.py	Wed May 31 20:19:26 2023 +0100
@@ -1,8 +1,12 @@
 import json
 from datetime import datetime
+from collections import defaultdict


 def main():
+    data = defaultdict(dict)
+    date_index = defaultdict(dict)
+
     with open('santander.json') as f:
         doc = json.load(f)

@@ -12,59 +16,169 @@
         return

     blocks = doc['Blocks']
-    table = extract_block(blocks, 'BlockType', 'TABLE')
-    table_child_ids = extract_child_ids(table)
-
-    for table_child_id in table_child_ids:
-        cell = extract_block(blocks, 'Id', table_child_id)
-        cell_child_ids = extract_child_ids(cell)
+
+    # Get format
+    lines = filter_blocks(blocks, 'BlockType', 'LINE')
+    for line in lines:
+        format = get_format(line['Text'])
+        data['format'] = format
+        if format:
+            break
+
+    # Find each date's value and position
+    cells = filter_blocks(blocks, 'BlockType', 'CELL')
+    for cell in cells:
+        child_ids = extract_child_ids(cell)

-        cell_value = ''
-        for index, cell_child_id in enumerate(cell_child_ids):
-            word_block = extract_block(blocks, 'Id', cell_child_id)
-            cell_value += word_block['Text'].lower()
+        # Get `Text` from `CELL` block
+        cell_text = ''
+        for index, child_id in enumerate(child_ids):
+            word_block = filter_blocks(blocks, 'Id', child_id)[0]
+            cell_text += word_block['Text']
+
+        date_string = is_date(cell_text)
+        if date_string:
+            cell_text = date_string
+            date_index[date_string]['column'] = cell['ColumnIndex']
+            date_index[date_string]['row'] = cell['RowIndex']
+

-            if index < len(cell_child_ids) - 1:
-                cell_value += '_'
+        cell_row_index = cell['RowIndex']
+        cell_column_index = cell['ColumnIndex']
+        data['rows'][cell_row_index][cell_column_index] = cell_text
+
+    # Delete unused rows
+    for year in date_index:
+        for row in data['rows']:
+            print(row)
+            exit()
+            if year[row] < row:
+                del data[row]
+
+    print(data)
+
+
+

-        print(cell_value)
-        print(is_date(cell_value))
+
+    print(data)


+    """
+    # Get table
+    table = filter_blocks(blocks, 'BlockType', 'TABLE')[0]
+    table_child_ids = extract_child_ids(table)
+
+    # Iterate over children and get `CELL` blocks
+    for table_child_id in table_child_ids:
+        cell = filter_blocks(blocks, 'Id', table_child_id)[0]
+        cell_child_ids = extract_child_ids(cell)
+
+        # Get `Text` from `CELL` block
+        cell_text = ''
+        for cell_child_id in cell_child_ids:
+            word_block = filter_blocks(blocks, 'Id', cell_child_id)[0]
+            cell_text += word_block['Text']
+
+        # Check if cell_text could be a date
+        date_string = is_date(cell_text)
+        if date_string:
+            date_column_index = cell['ColumnIndex']
+            data[date_column_index] = {'year': date_string}
+    """
+
+
+def filter_blocks(blocks, block_key, block_value):
+    """
+    Return the list of blocks whose `block_key` entry equals `block_value`
+    """
+
+    return [block for block in blocks if block[block_key] == block_value]
+
+
 def extract_child_ids(block):
+    """
+    Extract child Ids from a block
+    """
+
     if not 'Relationships' in block:
         return []

     return [r['Ids'] for r in block['Relationships'] if r['Type'] == 'CHILD'][0]


-def extract_block(blocks, block_key, block_value):
-    return [block for block in blocks if block[block_key] == block_value][0]
+def is_date(string_date):
+    """
+    Return the year (as a string) if the string parses as a date, else None
+    """

-
-def is_date(string_date):
-    formats_allowed = ['%d-%m-%Y', '%d/%m/%Y', '%d.%m.%Y', '%Y']
+    formats_allowed = ['%d-%m-%Y', '%d_%m_%Y', '%d/%m/%Y', '%d.%m.%Y', '%Y']

     for format_allowed in formats_allowed:
         try:
-            datetime.strptime(string_date, format_allowed)
+            date = datetime.strptime(string_date, format_allowed)

-            return True
+            return date.strftime("%Y")
         except ValueError:

             # Try removing characters from the beginning and end
             options = [string_date[:-1], string_date[1:], string_date[1:-1]]
             for option in options:
                 try:
-                    datetime.strptime(option, format_allowed)
+                    date = datetime.strptime(option, format_allowed)

-                    return True
+                    return date.strftime("%Y")
                 except ValueError:
                     continue

-    return False
+    return
+
+
+def get_format(phrase):
+    """
+    Return the amount format (thousand, million or billion) named in a phrase, if any
+    """
+
+    amount_formats = ['thousand', 'million', 'billion']
+
+    for amount_format in amount_formats:
+        plural_amount_format = f'{amount_format}s'
+
+        if amount_format in phrase or plural_amount_format in phrase:
+            return amount_format
+
+
+def clean(string_type, string):
+    characters = ['.', ',', '-', ' ']
+
+    clean_string = string
+    for character in characters:
+        clean_string = clean_string.replace(character, '')
+
+    return clean_string
+
+
+def format_amount(string_amount):
+    pass


 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
+
+"""
+Assumptions:
+- Thousand separator is `,`
+- Supported date formats: '%d-%m-%Y', '%d_%m_%Y', '%d/%m/%Y', '%d.%m.%Y', '%Y'
+- Accounting values are in the same column as their date, in the rows below it:
++-------+-------+
+| 2022  | 2023  |
++-------+-------+
+| 3,000 | 3,100 |
++-------+-------+
+|  120  |  150  |
++-------+-------+
+|  789  |  800  |
++-------+-------+
+- Account names must be in column index 1
+"""
\ No newline at end of file