Add ocr

7 years ago · aeb5135746
parent 65119499ae
commit aeb5135746
5 changed files with 77 additions and 0 deletions
--- a/ocr/README.md
+++ b/ocr/README.md
@ -0,0 +1,32 @@
 """Get image timestamps from Swift camera"""
 import os
 from glob import glob
 from datetime import datetime
 from tqdm import tqdm
 from PIL import Image
 import pandas as pd
 import pytesseract
 input_dir = 'jpg'
 # Size (pixels) of the bottom-right region that is cropped out and sent to OCR
 stamp_width, stamp_height = 550, 40
 # Format the OCR'd text is parsed with
 stamp_format = '%d/%m/%Y %H:%M:%S'
 # glob returns matches in arbitrary order; sort so the csv rows are reproducible
 jpg_names = sorted(glob(os.path.join(input_dir, '*.jpg')))
 dates = []
 for jpg_name in tqdm(jpg_names):
      # Context manager closes the underlying file once the image has been read
      with Image.open(jpg_name) as im:
           # Crop the bottom-right corner of the image
           w, h = im.size
           stamp = im.crop((w - stamp_width, h - stamp_height, w, h))
           # Perform OCR with tesseract
           text = pytesseract.image_to_string(stamp, lang='eng')
      # OCR output can carry surrounding whitespace (e.g. trailing newline or
      # form feed), which strptime rejects as unconverted data
      text = text.strip()
      # Convert to datetime object, naming the offending file on failure
      try:
           dates.append(datetime.strptime(text, stamp_format))
      except ValueError as err:
           raise ValueError(
                'Could not parse timestamp %r read from %s' % (text, jpg_name)
           ) from err
 # Save as csv: one row per image, indexed by file path
 df = pd.DataFrame(data=dates, index=jpg_names, columns=['date'])
 df.index.name = 'file'
 df.to_csv('image-dates.csv')
--- a/ocr/image-dates.csv
+++ b/ocr/image-dates.csv
@ -0,0 +1,13 @@
 file,date
 jpg\0000-SYCR0314.jpg,2019-04-25 07:00:02
 jpg\0000-SYCR0315.jpg,2019-04-25 08:00:02
 jpg\0000-SYCR0316.jpg,2019-04-25 09:00:02
 jpg\0000-SYCR0317.jpg,2019-04-25 10:00:02
 jpg\0000-SYCR0318.jpg,2019-04-25 11:00:02
 jpg\0000-SYCR0319.jpg,2019-04-25 12:00:01
 jpg\0000-SYCR0320.jpg,2019-04-25 13:00:01
 jpg\0000-SYCR0321.jpg,2019-04-25 14:00:01
 jpg\0000-SYCR0322.jpg,2019-04-25 15:00:01
 jpg\0000-SYCR0323.jpg,2019-04-25 16:00:01
 jpg\0000-SYCR0324.jpg,2019-04-25 17:00:01
 jpg\0000-SYCR0325.jpg,2019-04-26 07:00:02
--- a/ocr/jpg/0000-SYCR0314.jpg
+++ b/ocr/jpg/0000-SYCR0314.jpg
--- a/ocr/jpg/0000-SYCR0315.jpg
+++ b/ocr/jpg/0000-SYCR0315.jpg
--- a/ocr/swift_ocr.py
+++ b/ocr/swift_ocr.py
@ -0,0 +1,32 @@
 """Get image timestamps from Swift camera"""
 import os
 from glob import glob
 from datetime import datetime
 from tqdm import tqdm
 from PIL import Image
 import pandas as pd
 import pytesseract
 input_dir = 'jpg'
 # Size (pixels) of the bottom-right region that is cropped out and sent to OCR
 stamp_width, stamp_height = 550, 40
 # Format the OCR'd text is parsed with
 stamp_format = '%d/%m/%Y %H:%M:%S'
 # glob returns matches in arbitrary order; sort so the csv rows are reproducible
 jpg_names = sorted(glob(os.path.join(input_dir, '*.jpg')))
 dates = []
 for jpg_name in tqdm(jpg_names):
      # Context manager closes the underlying file once the image has been read
      with Image.open(jpg_name) as im:
           # Crop the bottom-right corner of the image
           w, h = im.size
           stamp = im.crop((w - stamp_width, h - stamp_height, w, h))
           # Perform OCR with tesseract
           text = pytesseract.image_to_string(stamp, lang='eng')
      # OCR output can carry surrounding whitespace (e.g. trailing newline or
      # form feed), which strptime rejects as unconverted data
      text = text.strip()
      # Convert to datetime object, naming the offending file on failure
      try:
           dates.append(datetime.strptime(text, stamp_format))
      except ValueError as err:
           raise ValueError(
                'Could not parse timestamp %r read from %s' % (text, jpg_name)
           ) from err
 # Save as csv: one row per image, indexed by file path
 df = pd.DataFrame(data=dates, index=jpg_names, columns=['date'])
 df.index.name = 'file'
 df.to_csv('image-dates.csv')