Mercurial > public > finance-parser
changeset 4:9005b7590008
state machine working
author | Dennis C. M. <dennis@denniscm.com> |
---|---|
date | Mon, 05 Jun 2023 10:13:43 +0100 |
parents | 2e5f3664f3e4 |
children | 2daf0dc08247 |
files | analyze_document/app.py events/analyze_document_event.json events/process_document_event.json events/upload_document_event.json process_document/app.py reports/itx_balance.pdf reports/san_balance.pdf upload_document/app.py |
diffstat | 8 files changed, 68 insertions(+), 51 deletions(-) [+] |
line wrap: on
line diff
--- a/analyze_document/app.py Fri Jun 02 20:12:29 2023 +0100 +++ b/analyze_document/app.py Mon Jun 05 10:13:43 2023 +0100 @@ -12,7 +12,10 @@ event_detail = event['detail'] bucket_name = event_detail['bucket']['name'] object_key = event_detail['object']['key'] - company_ticker = re.search('unprocessed/(.*).pdf', object_key).group(1) + + company_ticker = re.search('unprocessed/(.*)_', object_key).group(1) + doc_type = re.search(f'unprocessed/{company_ticker}_(.*).pdf', object_key).group(1) + file_id = uuid.uuid4() data_dict = textract_client.analyze_document( Document={'S3Object': {'Bucket': bucket_name, 'Name': object_key}}, @@ -20,7 +23,7 @@ ) data_string = json.dumps(data_dict, indent=2, default=str) - filename = f'{company_ticker}_{uuid.uuid4()}.json' + filename = f'{company_ticker}_{doc_type}_{file_id}.json' s3_client.put_object( Bucket=bucket_name, @@ -37,6 +40,10 @@ "statusCode": 200, "body": { "message": { + "companyTicker": company_ticker, + "docType": doc_type, + "fileId": file_id, + "fileName": filename, "objectKey": f'analyzed/{filename}', "bucketName": bucket_name }
--- a/events/analyze_document_event.json Fri Jun 02 20:12:29 2023 +0100 +++ b/events/analyze_document_event.json Mon Jun 05 10:13:43 2023 +0100 @@ -15,7 +15,7 @@ "name":"sandbox-finance-parser-data" }, "object":{ - "key":"unprocessed/san.pdf", + "key":"unprocessed/san_balance.pdf", "size":49856, "etag":"0adc595c8f2dbfabb5c4095f1f91b458", "sequencer":"00647A159E6438B1A6"
--- a/events/process_document_event.json Fri Jun 02 20:12:29 2023 +0100 +++ b/events/process_document_event.json Mon Jun 05 10:13:43 2023 +0100 @@ -2,7 +2,7 @@ "statusCode": 200, "body": { "message": { - "objectKey": "analyzed/san_f0799678-a362-4b7f-9fff-c26b0bbf2b15.json", + "objectKey": "analyzed/san_balance_f0799678-a362-4b7f-9fff-c26b0bbf2b15.json", "bucketName": "sandbox-finance-parser-data" } }
--- a/events/upload_document_event.json Fri Jun 02 20:12:29 2023 +0100 +++ b/events/upload_document_event.json Mon Jun 05 10:13:43 2023 +0100 @@ -2,7 +2,7 @@ "statusCode": 200, "body": { "message": { - "objectKey": "processed/san_d7312109-9099-4dd2-a984-55768641b25e.json", + "objectKey": "processed/san_balance_d7312109-9099-4dd2-a984-55768641b25e.json", "bucketName": "sandbox-finance-parser-data" } }
--- a/process_document/app.py Fri Jun 02 20:12:29 2023 +0100 +++ b/process_document/app.py Mon Jun 05 10:13:43 2023 +0100 @@ -8,12 +8,14 @@ def lambda_handler(event, context): - event_message = event['body']['message'] - object_key = event_message['objectKey'] - bucket_name = event_message['bucketName'] + event_msg = event['body']['message'] # Download file from s3 - s3_client.download_file(bucket_name, object_key, '/tmp/document.json') + s3_client.download_file( + event_msg['bucketName'], + event_msg['objectKey'], + '/tmp/document.json' + ) with open('/tmp/document.json') as f: doc = json.load(f) @@ -49,7 +51,7 @@ cell_text += '_' # Verify if `Text` could be a valid date - date_string = is_date(cell_text) + date_string = is_date(clean_text(cell_text, 'date')) if date_string: cell_text = date_string result['dateRow'] = cell['RowIndex'] @@ -57,7 +59,7 @@ cell_row_index = cell['RowIndex'] cell_column_index = cell['ColumnIndex'] - data[cell_row_index][cell_column_index] = clean(cell_text) + data[cell_row_index][cell_column_index] = clean_text(cell_text) try: data[cell_row_index]['type'] = cell['EntityTypes'] @@ -75,12 +77,12 @@ if len(row) > 1: result['data'][row_index] = row - filename = object_key.replace('analyzed/', 'processed/') + object_key = event_msg['objectKey'].replace('analyzed/', 'processed/') data_string = json.dumps(result, indent=2, default=str) s3_client.put_object( - Bucket=bucket_name, - Key=filename, + Bucket=event_msg['bucketName'], + Key=object_key, Body=data_string ) @@ -88,8 +90,12 @@ "statusCode": 200, "body": { "message": { - "objectKey": filename, - "bucketName": bucket_name + "companyTicker": event_msg['companyTicker'], + "docType": event_msg['docType'], + "fileId": event_msg['fileId'], + "fileName": event_msg['fileName'], + "objectKey": object_key, + "bucketName": event_msg['bucketName'] } }, } @@ -106,34 +112,20 @@ def is_date(string_date): """ Verify if a string could be a date. 
- - -> Funciona pero es un desastre <- """ - formats_allowed = ['%d-%m-%Y', '%d_%m_%Y', '%d/%m/%Y', '%d.%m.%Y', '%Y'] + formats_allowed = ['%d-%m-%Y', '%d/%m/%Y', '%Y'] for format_allowed in formats_allowed: try: date = datetime.strptime(string_date, format_allowed) if date.year > datetime.now().year or date.year < 1900: - return # Fecha fuera de rango + return # Date out of range date return date.strftime("%Y") except ValueError: - - # Try removing characters from the beginning and end - options = [string_date[:-1], string_date[1:], string_date[1:-1]] - for option in options: - try: - date = datetime.strptime(option, format_allowed) - - if date.year > datetime.now().year or date.year < 1900: - return # Fecha fuera de rango - - return date.strftime("%Y") - except ValueError: - continue + continue return @@ -152,14 +144,32 @@ return amount_format -def clean(text): +def clean_text(text, text_type='default'): """" Remove bad characters from word """ - characters = ['.', ',', '-', ' '] + special_chars = [ + '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', + '-', '_', '+', '=', '[', ']', '{', '}', '\\', '|', + ';', ':', '"', '\'', '<', '>', '/', '?', '.', ',' + ] + + if text_type == 'date': + allowed_chars = ['_', '-', '/'] - for character in characters: - text = text.replace(character, '') + # Sometimes date is '2020a' or 'b2020' because indexes + if text[-1].isalpha(): + special_chars.append(text[-1]) + + if text[0].isalpha(): + special_chars.append(text[0]) + else: + allowed_chars = ['.', ',', '-', ' '] + + special_chars = [char for char in special_chars if char not in allowed_chars] + + for char in special_chars: + text = text.replace(char, '') return text.lower()
--- a/upload_document/app.py Fri Jun 02 20:12:29 2023 +0100 +++ b/upload_document/app.py Mon Jun 05 10:13:43 2023 +0100 @@ -1,6 +1,6 @@ import json import boto3 -import re + s3_client = boto3.client('s3') dynamodb = boto3.resource('dynamodb') @@ -8,13 +8,14 @@ def lambda_handler(event, context): - event_message = event['body']['message'] - object_key = event_message['objectKey'] - bucket_name = event_message['bucketName'] - company_ticker = re.search('processed/(.*)_', object_key).group(1) + event_msg = event['body']['message'] # Download file from s3 - s3_client.download_file(bucket_name, object_key, '/tmp/document.json') + s3_client.download_file( + event_msg['bucketName'], + event_msg['objectKey'], + '/tmp/document.json' + ) with open('/tmp/document.json') as f: doc = json.load(f) @@ -28,9 +29,7 @@ column_types = [] """ - The following statement avoids getting a `2020` as the value - of `ASSETS`. - + Given: +------------------+------+------+ | ASSETS | 2020 | 2019 | +------------------+------+------+ @@ -38,6 +37,8 @@ +------------------+------+------+ | ASSETS_ACCOUNT_2 | | | +------------------+------+------+ + + The following statement avoids getting `2020` as the value of `ASSETS`. """ account_value = account[dateColumn] @@ -51,7 +52,7 @@ batch.put_item( Item={ - 'pk': f'balance#{company_ticker}', + 'pk': f"balance#{event_msg['companyTicker']}", 'sk': f'{date}#{row_index}', 'account_name': account['1'], 'account_value': account_value, @@ -60,13 +61,12 @@ ) # pk -> item_type#company_ticker - # sk -> date + # sk -> date#filename table.put_item( Item={ - 'pk': f'file#{company_ticker}', - 'sk': f"{date}", - 'filename': object_key.replace('processed/', '') + 'pk': f"file#{event_msg['companyTicker']}", + 'sk': f"{date}#{event_msg['objectKey'].replace('processed/', '')}" } )