comparison analyze_document/app.py @ 3:2e5f3664f3e4

documents analyzer almost finished
author Dennis C. M. <dennis@denniscm.com>
date Fri, 02 Jun 2023 20:12:29 +0100
parents
children 9005b7590008
comparison
equal deleted inserted replaced
2:ef8a4d95755a 3:2e5f3664f3e4
1 import json
2 import boto3
3 import uuid
4 import re
5
6
# Module-level AWS clients, created when the module is imported and used by lambda_handler.
7 textract_client = boto3.client('textract')
8 s3_client = boto3.client('s3')
9
10
def lambda_handler(event, context):
    """Analyze an uploaded PDF with Textract and store the result as JSON.

    Reads the bucket name and object key from ``event['detail']``, runs
    Textract table analysis on that object, writes the JSON-serialized
    response under ``analyzed/`` in the same bucket, and finally deletes
    the source object.

    Args:
        event: Invocation payload. Must contain
            ``event['detail']['bucket']['name']`` and
            ``event['detail']['object']['key']``. (Presumably an S3
            "Object Created" event delivered via EventBridge -- confirm
            against the trigger configuration.)
        context: Lambda context object; unused.

    Returns:
        A dict with ``statusCode`` 200 and a ``body`` whose ``message``
        holds the ``objectKey`` of the written JSON file and the
        ``bucketName``.

    Raises:
        KeyError: If the expected keys are missing from ``event``.
        ValueError: If the object key does not look like
            ``unprocessed/<ticker>.pdf``.
    """
    event_detail = event['detail']
    bucket_name = event_detail['bucket']['name']
    object_key = event_detail['object']['key']

    # The dot before "pdf" is escaped so only a literal ".pdf" extension
    # matches; an unmatched key is reported explicitly instead of failing
    # with an AttributeError on None.
    ticker_match = re.search(r'unprocessed/(.*)\.pdf', object_key)
    if ticker_match is None:
        raise ValueError(
            f"Object key {object_key!r} does not match "
            "'unprocessed/<ticker>.pdf'"
        )
    company_ticker = ticker_match.group(1)

    data_dict = textract_client.analyze_document(
        Document={'S3Object': {'Bucket': bucket_name, 'Name': object_key}},
        FeatureTypes=['TABLES']
    )

    # default=str stringifies any value in the response that json cannot
    # serialize natively.
    data_string = json.dumps(data_dict, indent=2, default=str)
    # A random UUID suffix keeps repeated uploads for the same ticker from
    # overwriting each other.
    filename = f'{company_ticker}_{uuid.uuid4()}.json'
    analyzed_key = f'analyzed/{filename}'

    s3_client.put_object(
        Bucket=bucket_name,
        Key=analyzed_key,
        Body=data_string
    )

    # Remove the source only after the analysis result has been written.
    s3_client.delete_object(
        Bucket=bucket_name,
        Key=object_key
    )

    return {
        "statusCode": 200,
        "body": {
            "message": {
                "objectKey": analyzed_key,
                "bucketName": bucket_name
            }
        },
    }