Commit 5d771fea authored by Maiushana Sutheshan

Delete questionExxtraction.py

parent 4904bf9b
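"""Extract the text of a scanned exam PDF with the Cloud Vision API.

Uploads input.pdf to a GCS bucket, runs asynchronous DOCUMENT_TEXT_DETECTION
on it, concatenates the recognised text from the per-page JSON results into
out.txt, and finally deletes the temporary input and output blobs.
"""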
import json
import os
import re

from google.cloud import vision
from google.cloud import storage

# Point the Google client libraries at the service-account key file.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = r'soloexamimages-973d884bd8dd.json'

client = vision.ImageAnnotatorClient()
storage_client = storage.Client()
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Uploads a file to the bucket.

    bucket_name           -- the ID of the GCS bucket
    source_file_name      -- local path of the file to upload
    destination_blob_name -- the ID of the GCS object to create
    """
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(source_file_name)
    print("File {} uploaded to {}.".format(source_file_name, destination_blob_name))
def delete_blob(bucket_name, blob_name):
    """Deletes a blob from the bucket.

    bucket_name -- the ID of the GCS bucket
    blob_name   -- the name of the GCS object to delete
    """
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(blob_name)
    blob.delete()
    print("Blob {} deleted.".format(blob_name))
# Vision API request parameters: treat the input as a PDF and write one
# JSON result file per page.
batch_size = 1
mime_type = 'application/pdf'
feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

# Input/output locations.
fileName = "input.pdf"
inputbucket = "soloexamocrinputbucket"
outputbucket = "soloexamocroutputbucket"
outputprefix = ''
outputfile = 'out.txt'
# Stage the PDF in the input bucket.
upload_blob(inputbucket, fileName, fileName)

# Build the asynchronous OCR request: read the PDF from GCS and write the
# JSON results to the output bucket.
gcs_source_uri = 'gs://' + inputbucket + '/' + fileName
gcs_source = vision.GcsSource(uri=gcs_source_uri)
input_config = vision.InputConfig(gcs_source=gcs_source, mime_type=mime_type)

gcs_destination_uri = 'gs://' + outputbucket + '/' + outputprefix
gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
output_config = vision.OutputConfig(gcs_destination=gcs_destination, batch_size=batch_size)

async_request = vision.AsyncAnnotateFileRequest(
    features=[feature], input_config=input_config, output_config=output_config)

# Start the long-running operation and block until it finishes (or the
# timeout, in seconds, expires).
operation = client.async_batch_annotate_files(requests=[async_request])
operation.result(timeout=18000)
# Collect the JSON result files and sort them by their starting page number
# (the Vision API names them like "output-1-to-1.json").
bucket = storage_client.get_bucket(outputbucket)
blob_list = list(bucket.list_blobs(prefix=outputprefix))
namelist = list(map(lambda x: x.name, blob_list))
blob_list.sort(key=lambda x: int(re.match(r'.+-(\d+)-.+', x.name).group(1)))
print('Output files:')
print(namelist)

# The staged input PDF is no longer needed.
delete_blob(inputbucket, fileName)
# Concatenate the recognised text of every page into a single text file.
with open(outputfile, 'w', encoding='utf8') as f:
    for item in blob_list:
        json_string = item.download_as_string()
        response = json.loads(json_string)
        for singlePageResponse in response["responses"]:
            full_text_annotation = singlePageResponse["fullTextAnnotation"]
            text = full_text_annotation["text"]
            f.write(text)

# Clean up the JSON result files from the output bucket.
blob_list = list(bucket.list_blobs(prefix=outputprefix))
for bl in blob_list:
    delete_blob(outputbucket, bl.name)
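# A minimal sketch of a typed alternative to the json.loads parsing above,
# assuming the proto-plus from_json helper shipped with google-cloud-vision 2.x:
#
#   typed = vision.AnnotateFileResponse.from_json(json_string)
#   for page in typed.responses:
#       f.write(page.full_text_annotation.text)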