Mercurial > public > finance-parser
comparison analyze_document/app.py @ 3:2e5f3664f3e4
documents analyzer almost finished
author | Dennis C. M. <dennis@denniscm.com> |
---|---|
date | Fri, 02 Jun 2023 20:12:29 +0100 |
parents | |
children | 9005b7590008 |
comparison
equal
deleted
inserted
replaced
2:ef8a4d95755a | 3:2e5f3664f3e4 |
---|---|
1 import json | |
2 import boto3 | |
3 import uuid | |
4 import re | |
5 | |
6 | |
# AWS service clients used by lambda_handler: Textract for document analysis,
# S3 for writing the result and deleting the processed source object.
# NOTE(review): presumably created at module level so that warm Lambda
# invocations reuse them instead of rebuilding a client per call — confirm.
textract_client = boto3.client('textract')
s3_client = boto3.client('s3')
9 | |
10 | |
def lambda_handler(event, context):
    """Analyze an uploaded PDF with Textract and store the raw result in S3.

    Reads the bucket name and object key from ``event['detail']``, runs
    Textract ``analyze_document`` with table detection on that object, writes
    the response as indented JSON to ``analyzed/<ticker>_<uuid4>.json`` in the
    same bucket, and then deletes the source object.

    NOTE(review): the event shape looks like an S3 notification delivered via
    EventBridge — confirm against the rule that triggers this function.

    Args:
        event: Mapping that provides ``detail.bucket.name`` and
            ``detail.object.key``.
        context: Lambda context object; not used.

    Returns:
        A dict with ``statusCode`` 200 and a ``body.message`` holding the
        ``objectKey`` of the stored JSON and the ``bucketName``.

    Raises:
        KeyError: If the event lacks the expected ``detail`` fields.
        ValueError: If the object key does not contain
            ``unprocessed/<ticker>.pdf``.
    """
    event_detail = event['detail']
    bucket_name = event_detail['bucket']['name']
    object_key = event_detail['object']['key']

    # Raw string with an escaped dot so that only a literal ".pdf" extension
    # matches (the unescaped "." previously accepted any character there).
    ticker_match = re.search(r'unprocessed/(.*)\.pdf', object_key)
    if ticker_match is None:
        # Fail with a clear message before any AWS call is made, instead of
        # an AttributeError from calling .group() on None.
        raise ValueError(
            f"Object key {object_key!r} does not match "
            "'unprocessed/<ticker>.pdf'"
        )
    company_ticker = ticker_match.group(1)

    data_dict = textract_client.analyze_document(
        Document={'S3Object': {'Bucket': bucket_name, 'Name': object_key}},
        FeatureTypes=['TABLES']
    )

    # default=str stringifies any value json cannot serialize natively.
    data_string = json.dumps(data_dict, indent=2, default=str)
    filename = f'{company_ticker}_{uuid.uuid4()}.json'
    analyzed_key = f'analyzed/{filename}'

    s3_client.put_object(
        Bucket=bucket_name,
        Key=analyzed_key,
        Body=data_string
    )

    # Remove the source object only after the analysis result has been stored.
    s3_client.delete_object(
        Bucket=bucket_name,
        Key=object_key
    )

    return {
        "statusCode": 200,
        "body": {
            "message": {
                "objectKey": analyzed_key,
                "bucketName": bucket_name
            }
        },
    }