# HG changeset patch # User Dennis C. M. # Date 1685981538 -3600 # Node ID d15ccf5f137314f9ab4acbb1c6cc464ec5e69d99 # Parent 2daf0dc08247ba0a3dd33e6d0b910077aefc565c fix bug clean_text diff -r 2daf0dc08247 -r d15ccf5f1373 analyze_document/app.py --- a/analyze_document/app.py Mon Jun 05 12:48:47 2023 +0100 +++ b/analyze_document/app.py Mon Jun 05 17:12:18 2023 +0100 @@ -15,7 +15,7 @@ company_ticker = re.search('unprocessed/(.*)_', object_key).group(1) doc_type = re.search(f'unprocessed/{company_ticker}_(.*).pdf', object_key).group(1) - file_id = uuid.uuid4() + file_id = str(uuid.uuid4()) data_dict = textract_client.analyze_document( Document={'S3Object': {'Bucket': bucket_name, 'Name': object_key}}, diff -r 2daf0dc08247 -r d15ccf5f1373 events/process_document_event.json --- a/events/process_document_event.json Mon Jun 05 12:48:47 2023 +0100 +++ b/events/process_document_event.json Mon Jun 05 17:12:18 2023 +0100 @@ -2,7 +2,11 @@ "statusCode": 200, "body": { "message": { - "objectKey": "analyzed/san_balance_f0799678-a362-4b7f-9fff-c26b0bbf2b15.json", + "companyTicker": "san", + "docType": "balance", + "fileId": "50d30446-015c-47bd-bb33-af487120c0d5", + "fileName": "san_balance_50d30446-015c-47bd-bb33-af487120c0d5.json", + "objectKey": "analyzed/san_balance_50d30446-015c-47bd-bb33-af487120c0d5.json", "bucketName": "sandbox-finance-parser-data" } } diff -r 2daf0dc08247 -r d15ccf5f1373 events/upload_document_event.json --- a/events/upload_document_event.json Mon Jun 05 12:48:47 2023 +0100 +++ b/events/upload_document_event.json Mon Jun 05 17:12:18 2023 +0100 @@ -2,7 +2,11 @@ "statusCode": 200, "body": { "message": { - "objectKey": "processed/san_balance_d7312109-9099-4dd2-a984-55768641b25e.json", + "companyTicker": "san", + "docType": "balance", + "fileId": "50d30446-015c-47bd-bb33-af487120c0d5", + "fileName": "san_balance_50d30446-015c-47bd-bb33-af487120c0d5.json", + "objectKey": "processed/san_balance_50d30446-015c-47bd-bb33-af487120c0d5.json", "bucketName": "sandbox-finance-parser-data" } } diff -r 2daf0dc08247 -r d15ccf5f1373 process_document/app.py --- a/process_document/app.py Mon Jun 05 12:48:47 2023 +0100 +++ b/process_document/app.py Mon Jun 05 17:12:18 2023 +0100 @@ -51,7 +51,7 @@ cell_text += '_' # Verify if `Text` could be a valid date - date_string = is_date(clean_text(cell_text, 'date')) + date_string = is_date(cell_text) if date_string: cell_text = date_string result['dateRow'] = cell['RowIndex'] @@ -121,11 +121,23 @@ date = datetime.strptime(string_date, format_allowed) if date.year > datetime.now().year or date.year < 1900: - return # Date out of range date + return # Fecha fuera de rango return date.strftime("%Y") except ValueError: - continue + + # Try removing characters from the beginning and end + options = [string_date[:-1], string_date[1:], string_date[1:-1]] + for option in options: + try: + date = datetime.strptime(option, format_allowed) + + if date.year > datetime.now().year or date.year < 1900: + return # Fecha fuera de rango + + return date.strftime("%Y") + except ValueError: + continue return @@ -157,15 +169,8 @@ if text_type == 'date': allowed_chars = ['_', '-', '/'] - - # Sometimes date is '2020a' or 'b2020' because indexes - if text[-1].isalpha(): - special_chars.append(text[-1]) - - if text[0].isalpha(): - special_chars.append(text[0]) else: - allowed_chars = ['.', ',', '-', ' '] + allowed_chars = ['_'] special_chars = [char for char in special_chars if char not in allowed_chars] diff -r 2daf0dc08247 -r d15ccf5f1373 template.yaml --- a/template.yaml Mon Jun 05 12:48:47 2023 +0100 +++ b/template.yaml Mon Jun 05 17:12:18 2023 +0100 @@ -1,6 +1,6 @@ AWSTemplateFormatVersion: '2010-09-09' Transform: AWS::Serverless-2016-10-31 -Description: Serverless balance sheet analyzer using Textract and a serverless API +Description: Serverless finance staments analyzer using Textract and a serverless API Conditions: CreateProdResources: !Equals @@ -168,4 +168,9 @@ - AttributeName: pk AttributeType: S - AttributeName: sk - AttributeType: S \ No newline at end of file + AttributeType: S + +Outputs: + GetReportEndpoint: + Description: "Endpoint to get a report" + Value: !Sub "https://${Api}.execute-api.${AWS::Region}.amazonaws.com/Prod/report" \ No newline at end of file diff -r 2daf0dc08247 -r d15ccf5f1373 upload_document/app.py --- a/upload_document/app.py Mon Jun 05 12:48:47 2023 +0100 +++ b/upload_document/app.py Mon Jun 05 17:12:18 2023 +0100 @@ -56,7 +56,8 @@ 'sk': f'{date}#{row_index}', 'account_name': account['1'], 'account_value': account_value, - 'column_types': column_types + 'column_types': column_types, + 'format': doc['format'] } ) @@ -65,7 +66,7 @@ table.put_item( Item={ - 'pk': f"file#{event_msg['companyTicker']}", + 'pk': f"file#balance#{event_msg['companyTicker']}", 'sk': f"{date}#{event_msg['objectKey'].replace('processed/', '')}" } )