Use OCR to fetch text in images

author Michael <heluecht@pirati.ca>

Sat, 13 Jan 2024 19:30:20 +0000 (19:30 +0000)

committer Michael <heluecht@pirati.ca>

Sat, 13 Jan 2024 19:30:20 +0000 (19:30 +0000)
author Michael <heluecht@pirati.ca>
Sat, 13 Jan 2024 19:30:20 +0000 (19:30 +0000)
committer Michael <heluecht@pirati.ca>
Sat, 13 Jan 2024 19:30:20 +0000 (19:30 +0000)
diff --git a/composer.json b/composer.json

index 21603c7b2731bdb500f962dc611c2fd1bd6368cb..903a3fab06501065955e6240121569534d2458f6 100644 (file)
--- a/composer.json
+++ b/composer.json
@@ -75,7 +75,8 @@
                 "npm-asset/moment": "^2.24",
                 "npm-asset/perfect-scrollbar": "0.6.16",
                 "npm-asset/textcomplete": "^0.18.2",
-               "npm-asset/typeahead.js": "^0.11.1"
+               "npm-asset/typeahead.js": "^0.11.1",
+               "thiagoalessio/tesseract_ocr": "^2.13"
         },
         "suggest": {
                 "ext-imagick": "For faster image processing",
diff --git a/composer.lock b/composer.lock

index a541913811b7f87f0e6dff3585e3c332dfe2d7b8..9442d41823456da8260ed7916df5bbc00748b6a7 100644 (file)
--- a/composer.lock
+++ b/composer.lock
@@ -4,7 +4,7 @@
          "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
          "This file is @generated automatically"
      ],
-    "content-hash": "082b16e2c88895f1a03d5b0ffe678ba7",
+    "content-hash": "131ca83d1c6f64092ff5220e4a14a101",
      "packages": [
          {
              "name": "asika/simple-console",
@@ -1317,6 +1317,24 @@
                  "html",
                  "markdown"
              ],
+            "funding": [
+                {
+                    "url": "https://www.colinodell.com/sponsor",
+                    "type": "custom"
+                },
+                {
+                    "url": "https://www.paypal.me/colinpodell/10.00",
+                    "type": "custom"
+                },
+                {
+                    "url": "https://github.com/colinodell",
+                    "type": "github"
+                },
+                {
+                    "url": "https://www.patreon.com/colinodell",
+                    "type": "patreon"
+                }
+            ],
              "time": "2020-07-01T00:34:03+00:00"
          },
          {
@@ -4224,6 +4242,50 @@
              ],
              "time": "2023-01-26T09:26:14+00:00"
          },
+        {
+            "name": "thiagoalessio/tesseract_ocr",
+            "version": "2.13.0",
+            "source": {
+                "type": "git",
+                "url": "https://github.com/thiagoalessio/tesseract-ocr-for-php.git",
+                "reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1"
+            },
+            "dist": {
+                "type": "zip",
+                "url": "https://api.github.com/repos/thiagoalessio/tesseract-ocr-for-php/zipball/232a8cb9d571992f9bd1e263f2f6909cf6c173a1",
+                "reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1",
+                "shasum": ""
+            },
+            "require": {
+                "php": "^5.3 || ^7.0 || ^8.0"
+            },
+            "require-dev": {
+                "phpunit/php-code-coverage": "^2.2.4 || ^9.0.0"
+            },
+            "type": "library",
+            "autoload": {
+                "psr-4": {
+                    "thiagoalessio\\TesseractOCR\\": "src/"
+                }
+            },
+            "notification-url": "https://packagist.org/downloads/",
+            "license": [
+                "MIT"
+            ],
+            "authors": [
+                {
+                    "name": "thiagoalessio",
+                    "email": "thiagoalessio@me.com"
+                }
+            ],
+            "description": "A wrapper to work with Tesseract OCR inside PHP.",
+            "keywords": [
+                "OCR",
+                "Tesseract",
+                "text recognition"
+            ],
+            "time": "2023-10-05T21:14:48+00:00"
+        },
          {
              "name": "ua-parser/uap-php",
              "version": "v3.9.14",
diff --git a/src/Model/Post/Media.php b/src/Model/Post/Media.php

index df05db98d567e3ab4a886286a524218c12e36f4b..afd6ca8383538222807e16200c055565314b1266 100644 (file)
--- a/src/Model/Post/Media.php
+++ b/src/Model/Post/Media.php
@@ -208,13 +208,17 @@ class Media
                 $filetype = !empty($media['mimetype']) ? strtolower(current(explode('/', $media['mimetype']))) : '';
  
                 if (($media['type'] == self::IMAGE) || ($filetype == 'image')) {
-                       $imagedata = Images::getInfoFromURLCached($media['url']);
+                       $imagedata = Images::getInfoFromURLCached($media['url'], empty($media['description']));
                         if ($imagedata) {
                                 $media['mimetype'] = $imagedata['mime'];
                                 $media['size'] = $imagedata['size'];
                                 $media['width'] = $imagedata[0];
                                 $media['height'] = $imagedata[1];
                                 $media['blurhash'] = $imagedata['blurhash'] ?? null;
+                               if (!empty($imagedata['description']) && empty($media['description'])) {
+                                       $media['description'] = $imagedata['description'];
+                                       Logger::debug('Detected text for image', $media);
+                               }
                         } else {
                                 Logger::notice('No image data', ['media' => $media]);
                         }
diff --git a/src/Util/Images.php b/src/Util/Images.php

index b44b1fb8f558fe0a4dbc76078dc531e4dc8a8155..0d64601f1d5b0ecd39c07cc63012dca3bf3bb162 100644 (file)
--- a/src/Util/Images.php
+++ b/src/Util/Images.php
@@ -22,10 +22,12 @@
  namespace Friendica\Util;
  
  use Friendica\Core\Logger;
+use Friendica\Core\System;
  use Friendica\DI;
  use Friendica\Model\Photo;
  use Friendica\Network\HTTPClient\Client\HttpClientAccept;
  use Friendica\Object\Image;
+use thiagoalessio\TesseractOCR\TesseractOCR;
  
  /**
   * Image utilities
@@ -181,10 +183,11 @@ class Images
          * Gets info array from given URL, cached data has priority
          *
          * @param string $url
+        * @param bool   $ocr
          * @return array Info
          * @throws \Friendica\Network\HTTPException\InternalServerErrorException
          */
-       public static function getInfoFromURLCached(string $url): array
+       public static function getInfoFromURLCached(string $url, bool $ocr = false): array
         {
                 $data = [];
  
@@ -192,12 +195,12 @@ class Images
                         return $data;
                 }
  
-               $cacheKey = 'getInfoFromURL:' . sha1($url);
+               $cacheKey = 'getInfoFromURL:' . sha1($url . $ocr);
  
                 $data = DI::cache()->get($cacheKey);
  
                 if (empty($data) || !is_array($data)) {
-                       $data = self::getInfoFromURL($url);
+                       $data = self::getInfoFromURL($url, $ocr);
  
                         DI::cache()->set($cacheKey, $data);
                 }
@@ -209,10 +212,11 @@ class Images
          * Gets info from URL uncached
          *
          * @param string $url
+        * @param bool   $ocr
          * @return array Info array
          * @throws \Friendica\Network\HTTPException\InternalServerErrorException
          */
-       public static function getInfoFromURL(string $url): array
+       public static function getInfoFromURL(string $url, bool $ocr = false): array
         {
                 $data = [];
  
@@ -257,6 +261,17 @@ class Images
  
                 if ($image->isValid()) {
                         $data['blurhash'] = $image->getBlurHash();
+                       
+                       if ($ocr && DI::config()->get('system', 'tesseract_ocr')) {
+                               $ocr = new TesseractOCR();
+                               try {
+                                       $ocr->tempDir(System::getTempPath());
+                                       $ocr->imageData($img_str, strlen($img_str));
+                                       $data['description'] = $ocr->run();
+                               } catch (\Throwable $th) {
+                                       Logger::info('Error calling TesseractOCR', ['message' => $th->getMessage()]);
+                               }                       
+                       }
                 }
  
                 $data['size'] = $filesize;
diff --git a/static/defaults.config.php b/static/defaults.config.php

index 819b0ad85fb45d357d1fd9c2978820d1a0155f41..b3a7f49984af88d1ce63bebef590ad702b2a7132 100644 (file)
--- a/static/defaults.config.php
+++ b/static/defaults.config.php
@@ -441,6 +441,10 @@ return [
                 // Don't show smilies.
                 'no_smilies' => false,
  
+               // tesseract_ocr (Boolean)
+               // Use Tesseract OCR to extract text from images
+               'tesseract_ocr' => false,
+
                 // optimize_all_tables (Boolean)
                 // Optimizes all tables instead of only tables like workerqueue or the cache
                 'optimize_all_tables' => false,
author	Michael <heluecht@pirati.ca>
	Sat, 13 Jan 2024 19:30:20 +0000 (19:30 +0000)
committer	Michael <heluecht@pirati.ca>
	Sat, 13 Jan 2024 19:30:20 +0000 (19:30 +0000)
composer.json		patch \| blob \| history
composer.lock		patch \| blob \| history
src/Model/Post/Media.php		patch \| blob \| history
src/Util/Images.php		patch \| blob \| history
static/defaults.config.php		patch \| blob \| history