changeset 6:d15ccf5f1373

fix bug clean_text
author Dennis C. M. <dennis@denniscm.com>
date Mon, 05 Jun 2023 17:12:18 +0100
parents 2daf0dc08247
children 7c5fb7573dd8
files analyze_document/app.py events/process_document_event.json events/upload_document_event.json process_document/app.py template.yaml upload_document/app.py
diffstat 6 files changed, 37 insertions(+), 18 deletions(-) [+]
line wrap: on
line diff
--- a/analyze_document/app.py	Mon Jun 05 12:48:47 2023 +0100
+++ b/analyze_document/app.py	Mon Jun 05 17:12:18 2023 +0100
@@ -15,7 +15,7 @@
 
     company_ticker = re.search('unprocessed/(.*)_', object_key).group(1)
     doc_type = re.search(f'unprocessed/{company_ticker}_(.*).pdf', object_key).group(1)
-    file_id = uuid.uuid4()
+    file_id = str(uuid.uuid4())
 
     data_dict = textract_client.analyze_document(
         Document={'S3Object': {'Bucket': bucket_name, 'Name': object_key}},
--- a/events/process_document_event.json	Mon Jun 05 12:48:47 2023 +0100
+++ b/events/process_document_event.json	Mon Jun 05 17:12:18 2023 +0100
@@ -2,7 +2,11 @@
    "statusCode": 200,
    "body": {
       "message": {
-         "objectKey": "analyzed/san_balance_f0799678-a362-4b7f-9fff-c26b0bbf2b15.json",
+         "companyTicker": "san",
+         "docType": "balance",
+         "fileId": "50d30446-015c-47bd-bb33-af487120c0d5",
+         "fileName": "san_balance_50d30446-015c-47bd-bb33-af487120c0d5.json",
+         "objectKey": "analyzed/san_balance_50d30446-015c-47bd-bb33-af487120c0d5.json",
          "bucketName": "sandbox-finance-parser-data"
       }
    }
--- a/events/upload_document_event.json	Mon Jun 05 12:48:47 2023 +0100
+++ b/events/upload_document_event.json	Mon Jun 05 17:12:18 2023 +0100
@@ -2,7 +2,11 @@
    "statusCode": 200,
    "body": {
       "message": {
-         "objectKey": "processed/san_balance_d7312109-9099-4dd2-a984-55768641b25e.json",
+         "companyTicker": "san",
+         "docType": "balance",
+         "fileId": "50d30446-015c-47bd-bb33-af487120c0d5",
+         "fileName": "san_balance_50d30446-015c-47bd-bb33-af487120c0d5.json",
+         "objectKey": "processed/san_balance_50d30446-015c-47bd-bb33-af487120c0d5.json",
          "bucketName": "sandbox-finance-parser-data"
       }
    }
--- a/process_document/app.py	Mon Jun 05 12:48:47 2023 +0100
+++ b/process_document/app.py	Mon Jun 05 17:12:18 2023 +0100
@@ -51,7 +51,7 @@
                 cell_text += '_'
 
         # Verify if `Text` could be a valid date
-        date_string = is_date(clean_text(cell_text, 'date'))
+        date_string = is_date(cell_text)
         if date_string:
             cell_text = date_string
             result['dateRow'] = cell['RowIndex']
@@ -121,11 +121,23 @@
             date = datetime.strptime(string_date, format_allowed)
 
             if date.year > datetime.now().year or date.year < 1900:
-                return  # Date out of range date
+                return  # Date out of range
 
             return date.strftime("%Y")
         except ValueError:
-            continue
+
+            # Retry after dropping one character from the end, the start, or both
+            options = [string_date[:-1], string_date[1:], string_date[1:-1]]
+            for option in options:
+                try:
+                    date = datetime.strptime(option, format_allowed)
+
+                    if date.year > datetime.now().year or date.year < 1900:
+                        return  # Date out of range
+
+                    return date.strftime("%Y")
+                except ValueError:
+                    continue
 
     return
 
@@ -157,15 +169,8 @@
 
     if text_type == 'date':
         allowed_chars = ['_', '-', '/']
-
-        # Sometimes date is '2020a' or 'b2020' because indexes
-        if text[-1].isalpha():
-            special_chars.append(text[-1])
-
-        if text[0].isalpha():
-            special_chars.append(text[0])
     else:
-        allowed_chars = ['.', ',', '-', ' ']
+        allowed_chars = ['_']
 
     special_chars = [char for char in special_chars if char not in allowed_chars]
 
--- a/template.yaml	Mon Jun 05 12:48:47 2023 +0100
+++ b/template.yaml	Mon Jun 05 17:12:18 2023 +0100
@@ -1,6 +1,6 @@
 AWSTemplateFormatVersion: '2010-09-09'
 Transform: AWS::Serverless-2016-10-31
-Description: Serverless balance sheet analyzer using Textract and a serverless API
+Description: Serverless financial statements analyzer using Textract and a serverless API
 
 Conditions:
   CreateProdResources: !Equals
@@ -168,4 +168,9 @@
         - AttributeName: pk
           AttributeType: S
         - AttributeName: sk
-          AttributeType: S
\ No newline at end of file
+          AttributeType: S
+
+Outputs:
+  GetReportEndpoint:
+    Description: "Endpoint to get a report"
+    Value: !Sub "https://${Api}.execute-api.${AWS::Region}.amazonaws.com/Prod/report"
\ No newline at end of file
--- a/upload_document/app.py	Mon Jun 05 12:48:47 2023 +0100
+++ b/upload_document/app.py	Mon Jun 05 17:12:18 2023 +0100
@@ -56,7 +56,8 @@
                         'sk': f'{date}#{row_index}',
                         'account_name': account['1'],
                         'account_value': account_value,
-                        'column_types': column_types
+                        'column_types': column_types,
+                        'format': doc['format']
                     }
                 )
 
@@ -65,7 +66,7 @@
 
         table.put_item(
             Item={
-                'pk': f"file#{event_msg['companyTicker']}",
+                'pk': f"file#balance#{event_msg['companyTicker']}",
                 'sk': f"{date}#{event_msg['objectKey'].replace('processed/', '')}"
             }
         )