Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
2
21_22-J050
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
21_22-J050
21_22-J050
Commits
07d89dfe
Commit
07d89dfe
authored
Apr 28, 2022
by
Maiushana Sutheshan
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
ocr changes added
parent
3813e655
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
106 additions
and
0 deletions
+106
-0
questionExxtraction.py
questionExxtraction.py
+106
-0
No files found.
questionExxtraction.py
0 → 100644
View file @
07d89dfe
import
json
import
os
,
io
import
re
from
google.cloud
import
vision
from
google.cloud
import
storage
from
google.protobuf
import
json_format
# Point GOOGLE_APPLICATION_CREDENTIALS at the service-account key file.
# NOTE(review): this unconditionally overrides any value already present in
# the environment, and it is set before the clients below are constructed,
# presumably so that they pick this key up -- confirm.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "soloexamimages-973d884bd8dd.json"

# Shared API clients used by the rest of this module: `client` talks to the
# Vision API, `storage_client` to Cloud Storage.
client = vision.ImageAnnotatorClient()
storage_client = storage.Client()
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file into a Cloud Storage bucket and report it on stdout.

    Relies on the module-level ``storage_client``.

    Args:
        bucket_name: ID of the GCS bucket to upload into,
            e.g. "your-bucket-name".
        source_file_name: Path of the local file to upload,
            e.g. "local/path/to/file".
        destination_blob_name: Name of the GCS object to create,
            e.g. "storage-object-name".
    """
    target = storage_client.bucket(bucket_name).blob(destination_blob_name)
    target.upload_from_filename(source_file_name)

    message = "File {} uploaded to {}.".format(
        source_file_name, destination_blob_name
    )
    print(message)
def delete_blob(bucket_name, blob_name):
    """Delete a single object from a Cloud Storage bucket and report it on stdout.

    Relies on the module-level ``storage_client``.

    Args:
        bucket_name: ID of the GCS bucket holding the object,
            e.g. "your-bucket-name".
        blob_name: Name of the object to delete, e.g. "your-object-name".
    """
    doomed = storage_client.bucket(bucket_name).blob(blob_name)
    doomed.delete()

    message = "Blob {} deleted.".format(blob_name)
    print(message)
# ---------------------------------------------------------------------------
# OCR pipeline (runs at import / script start):
#   1. upload the local PDF to the input bucket,
#   2. ask the Vision API to run DOCUMENT_TEXT_DETECTION on it asynchronously,
#      writing JSON result files to the output bucket,
#   3. download those JSON files in page order and concatenate the recognised
#      text into a local text file,
#   4. delete the temporary input and output objects.
# ---------------------------------------------------------------------------

# Passed to vision.OutputConfig below. Per the Vision API documentation this
# is the number of page responses stored in each output JSON file.
batch_size = 1
mime_type = 'application/pdf'
feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

fileName = "input.pdf"
inputbucket = "soloexamocrinputbucket"
outputbucket = "soloexamocroutputbucket"
outputprefix = ''
outputfile = 'out.txt'

# The local file is uploaded under its own name.
upload_blob(inputbucket, fileName, fileName)

try:
    gcs_source_uri = 'gs://' + inputbucket + '/' + fileName
    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination_uri = 'gs://' + outputbucket + '/' + outputprefix
    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(
        gcs_destination=gcs_destination,
        batch_size=batch_size,
    )

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature],
        input_config=input_config,
        output_config=output_config,
    )

    operation = client.async_batch_annotate_files(requests=[async_request])
    # Block until the OCR job has finished.
    # NOTE(review): 18000 is presumably seconds (5 hours) -- confirm intended.
    operation.result(timeout=18000)

    bucket = storage_client.get_bucket(outputbucket)

    # Result shards are recognised by a "-<number>-" segment in their name.
    # Because `outputprefix` is empty, the listing covers the whole bucket, so
    # objects that do not look like result shards are skipped instead of
    # crashing the sort key on a failed match.
    shard_pattern = re.compile(r'.+-(\d+)-.+')
    blob_list = [
        blob
        for blob in bucket.list_blobs(prefix=outputprefix)
        if shard_pattern.match(blob.name)
    ]
    # Sort numerically on the captured number so the pages come out in order
    # (a plain string sort would put "10" before "2").
    blob_list.sort(key=lambda x: int(shard_pattern.match(x.name).group(1)))
    namelist = [blob.name for blob in blob_list]

    print('Output files:')
    for name in namelist:
        print(name)
finally:
    # Always remove the uploaded PDF, even when the OCR request or the wait
    # above fails, so the input bucket does not accumulate stale uploads.
    delete_blob(inputbucket, fileName)

with open(outputfile, 'w', encoding='utf8') as f:
    for item in blob_list:
        # NOTE: download_as_string() is a deprecated alias of
        # download_as_bytes() in newer google-cloud-storage releases; it is
        # kept here for compatibility with older installed versions.
        json_string = item.download_as_string()
        response = json.loads(json_string)

        for singlePageResponse in response.get("responses", []):
            # Pages with no detected text (or a per-page error) carry no
            # "fullTextAnnotation" key; they contribute no text rather than
            # raising KeyError.
            full_text_annotation = singlePageResponse.get("fullTextAnnotation", {})
            text = full_text_annotation.get("text", "")
            f.write(text)

# Remove only the result shards that were just consumed; unrelated objects
# living under the same prefix are left untouched.
for bl in blob_list:
    delete_blob(outputbucket, bl.name)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment