Mercurial > public > finance-parser
changeset 6:d15ccf5f1373
Fix bug in clean_text
author | Dennis C. M. <dennis@denniscm.com> |
---|---|
date | Mon, 05 Jun 2023 17:12:18 +0100 |
parents | 2daf0dc08247 |
children | 7c5fb7573dd8 |
files | analyze_document/app.py events/process_document_event.json events/upload_document_event.json process_document/app.py template.yaml upload_document/app.py |
diffstat | 6 files changed, 37 insertions(+), 18 deletions(-) [+] |
line wrap: on
line diff
--- a/analyze_document/app.py Mon Jun 05 12:48:47 2023 +0100 +++ b/analyze_document/app.py Mon Jun 05 17:12:18 2023 +0100 @@ -15,7 +15,7 @@ company_ticker = re.search('unprocessed/(.*)_', object_key).group(1) doc_type = re.search(f'unprocessed/{company_ticker}_(.*).pdf', object_key).group(1) - file_id = uuid.uuid4() + file_id = str(uuid.uuid4()) data_dict = textract_client.analyze_document( Document={'S3Object': {'Bucket': bucket_name, 'Name': object_key}},
--- a/events/process_document_event.json Mon Jun 05 12:48:47 2023 +0100 +++ b/events/process_document_event.json Mon Jun 05 17:12:18 2023 +0100 @@ -2,7 +2,11 @@ "statusCode": 200, "body": { "message": { - "objectKey": "analyzed/san_balance_f0799678-a362-4b7f-9fff-c26b0bbf2b15.json", + "companyTicker": "san", + "docType": "balance", + "fileId": "50d30446-015c-47bd-bb33-af487120c0d5", + "fileName": "san_balance_50d30446-015c-47bd-bb33-af487120c0d5.json", + "objectKey": "analyzed/san_balance_50d30446-015c-47bd-bb33-af487120c0d5.json", "bucketName": "sandbox-finance-parser-data" } }
--- a/events/upload_document_event.json Mon Jun 05 12:48:47 2023 +0100 +++ b/events/upload_document_event.json Mon Jun 05 17:12:18 2023 +0100 @@ -2,7 +2,11 @@ "statusCode": 200, "body": { "message": { - "objectKey": "processed/san_balance_d7312109-9099-4dd2-a984-55768641b25e.json", + "companyTicker": "san", + "docType": "balance", + "fileId": "50d30446-015c-47bd-bb33-af487120c0d5", + "fileName": "san_balance_50d30446-015c-47bd-bb33-af487120c0d5.json", + "objectKey": "processed/san_balance_50d30446-015c-47bd-bb33-af487120c0d5.json", "bucketName": "sandbox-finance-parser-data" } }
--- a/process_document/app.py Mon Jun 05 12:48:47 2023 +0100 +++ b/process_document/app.py Mon Jun 05 17:12:18 2023 +0100 @@ -51,7 +51,7 @@ cell_text += '_' # Verify if `Text` could be a valid date - date_string = is_date(clean_text(cell_text, 'date')) + date_string = is_date(cell_text) if date_string: cell_text = date_string result['dateRow'] = cell['RowIndex'] @@ -121,11 +121,23 @@ date = datetime.strptime(string_date, format_allowed) if date.year > datetime.now().year or date.year < 1900: - return # Date out of range date + return # Fecha fuera de rango return date.strftime("%Y") except ValueError: - continue + + # Try removing characters from the beginning and end + options = [string_date[:-1], string_date[1:], string_date[1:-1]] + for option in options: + try: + date = datetime.strptime(option, format_allowed) + + if date.year > datetime.now().year or date.year < 1900: + return # Fecha fuera de rango + + return date.strftime("%Y") + except ValueError: + continue return @@ -157,15 +169,8 @@ if text_type == 'date': allowed_chars = ['_', '-', '/'] - - # Sometimes date is '2020a' or 'b2020' because indexes - if text[-1].isalpha(): - special_chars.append(text[-1]) - - if text[0].isalpha(): - special_chars.append(text[0]) else: - allowed_chars = ['.', ',', '-', ' '] + allowed_chars = ['_'] special_chars = [char for char in special_chars if char not in allowed_chars]
--- a/template.yaml Mon Jun 05 12:48:47 2023 +0100 +++ b/template.yaml Mon Jun 05 17:12:18 2023 +0100 @@ -1,6 +1,6 @@ AWSTemplateFormatVersion: '2010-09-09' Transform: AWS::Serverless-2016-10-31 -Description: Serverless balance sheet analyzer using Textract and a serverless API +Description: Serverless finance staments analyzer using Textract and a serverless API Conditions: CreateProdResources: !Equals @@ -168,4 +168,9 @@ - AttributeName: pk AttributeType: S - AttributeName: sk - AttributeType: S \ No newline at end of file + AttributeType: S + +Outputs: + GetReportEndpoint: + Description: "Endpoint to get a report" + Value: !Sub "https://${Api}.execute-api.${AWS::Region}.amazonaws.com/Prod/report" \ No newline at end of file
--- a/upload_document/app.py Mon Jun 05 12:48:47 2023 +0100 +++ b/upload_document/app.py Mon Jun 05 17:12:18 2023 +0100 @@ -56,7 +56,8 @@ 'sk': f'{date}#{row_index}', 'account_name': account['1'], 'account_value': account_value, - 'column_types': column_types + 'column_types': column_types, + 'format': doc['format'] } ) @@ -65,7 +66,7 @@ table.put_item( Item={ - 'pk': f"file#{event_msg['companyTicker']}", + 'pk': f"file#balance#{event_msg['companyTicker']}", 'sk': f"{date}#{event_msg['objectKey'].replace('processed/', '')}" } )