git.mxchange.org Git - friendica.git/commitdiff
Use OCR to fetch text in images
authorMichael <heluecht@pirati.ca>
Sat, 13 Jan 2024 19:30:20 +0000 (19:30 +0000)
committerMichael <heluecht@pirati.ca>
Sat, 13 Jan 2024 19:30:20 +0000 (19:30 +0000)
composer.json
composer.lock
src/Model/Post/Media.php
src/Util/Images.php
static/defaults.config.php

index 21603c7b2731bdb500f962dc611c2fd1bd6368cb..903a3fab06501065955e6240121569534d2458f6 100644 (file)
@@ -75,7 +75,8 @@
                "npm-asset/moment": "^2.24",
                "npm-asset/perfect-scrollbar": "0.6.16",
                "npm-asset/textcomplete": "^0.18.2",
-               "npm-asset/typeahead.js": "^0.11.1"
+               "npm-asset/typeahead.js": "^0.11.1",
+               "thiagoalessio/tesseract_ocr": "^2.13"
        },
        "suggest": {
                "ext-imagick": "For faster image processing",
index a541913811b7f87f0e6dff3585e3c332dfe2d7b8..9442d41823456da8260ed7916df5bbc00748b6a7 100644 (file)
@@ -4,7 +4,7 @@
         "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
         "This file is @generated automatically"
     ],
-    "content-hash": "082b16e2c88895f1a03d5b0ffe678ba7",
+    "content-hash": "131ca83d1c6f64092ff5220e4a14a101",
     "packages": [
         {
             "name": "asika/simple-console",
                 "html",
                 "markdown"
             ],
+            "funding": [
+                {
+                    "url": "https://www.colinodell.com/sponsor",
+                    "type": "custom"
+                },
+                {
+                    "url": "https://www.paypal.me/colinpodell/10.00",
+                    "type": "custom"
+                },
+                {
+                    "url": "https://github.com/colinodell",
+                    "type": "github"
+                },
+                {
+                    "url": "https://www.patreon.com/colinodell",
+                    "type": "patreon"
+                }
+            ],
             "time": "2020-07-01T00:34:03+00:00"
         },
         {
             ],
             "time": "2023-01-26T09:26:14+00:00"
         },
+        {
+            "name": "thiagoalessio/tesseract_ocr",
+            "version": "2.13.0",
+            "source": {
+                "type": "git",
+                "url": "https://github.com/thiagoalessio/tesseract-ocr-for-php.git",
+                "reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1"
+            },
+            "dist": {
+                "type": "zip",
+                "url": "https://api.github.com/repos/thiagoalessio/tesseract-ocr-for-php/zipball/232a8cb9d571992f9bd1e263f2f6909cf6c173a1",
+                "reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1",
+                "shasum": ""
+            },
+            "require": {
+                "php": "^5.3 || ^7.0 || ^8.0"
+            },
+            "require-dev": {
+                "phpunit/php-code-coverage": "^2.2.4 || ^9.0.0"
+            },
+            "type": "library",
+            "autoload": {
+                "psr-4": {
+                    "thiagoalessio\\TesseractOCR\\": "src/"
+                }
+            },
+            "notification-url": "https://packagist.org/downloads/",
+            "license": [
+                "MIT"
+            ],
+            "authors": [
+                {
+                    "name": "thiagoalessio",
+                    "email": "thiagoalessio@me.com"
+                }
+            ],
+            "description": "A wrapper to work with Tesseract OCR inside PHP.",
+            "keywords": [
+                "OCR",
+                "Tesseract",
+                "text recognition"
+            ],
+            "time": "2023-10-05T21:14:48+00:00"
+        },
         {
             "name": "ua-parser/uap-php",
             "version": "v3.9.14",
index df05db98d567e3ab4a886286a524218c12e36f4b..afd6ca8383538222807e16200c055565314b1266 100644 (file)
@@ -208,13 +208,17 @@ class Media
                $filetype = !empty($media['mimetype']) ? strtolower(current(explode('/', $media['mimetype']))) : '';
 
                if (($media['type'] == self::IMAGE) || ($filetype == 'image')) {
-                       $imagedata = Images::getInfoFromURLCached($media['url']);
+                       $imagedata = Images::getInfoFromURLCached($media['url'], empty($media['description']));
                        if ($imagedata) {
                                $media['mimetype'] = $imagedata['mime'];
                                $media['size'] = $imagedata['size'];
                                $media['width'] = $imagedata[0];
                                $media['height'] = $imagedata[1];
                                $media['blurhash'] = $imagedata['blurhash'] ?? null;
+                               if (!empty($imagedata['description']) && empty($media['description'])) {
+                                       $media['description'] = $imagedata['description'];
+                                       Logger::debug('Detected text for image', $media);
+                               }
                        } else {
                                Logger::notice('No image data', ['media' => $media]);
                        }
index b44b1fb8f558fe0a4dbc76078dc531e4dc8a8155..0d64601f1d5b0ecd39c07cc63012dca3bf3bb162 100644 (file)
 namespace Friendica\Util;
 
 use Friendica\Core\Logger;
+use Friendica\Core\System;
 use Friendica\DI;
 use Friendica\Model\Photo;
 use Friendica\Network\HTTPClient\Client\HttpClientAccept;
 use Friendica\Object\Image;
+use thiagoalessio\TesseractOCR\TesseractOCR;
 
 /**
  * Image utilities
@@ -181,10 +183,11 @@ class Images
         * Gets info array from given URL, cached data has priority
         *
         * @param string $url
+        * @param bool   $ocr
         * @return array Info
         * @throws \Friendica\Network\HTTPException\InternalServerErrorException
         */
-       public static function getInfoFromURLCached(string $url): array
+       public static function getInfoFromURLCached(string $url, bool $ocr = false): array
        {
                $data = [];
 
@@ -192,12 +195,12 @@ class Images
                        return $data;
                }
 
-               $cacheKey = 'getInfoFromURL:' . sha1($url);
+               $cacheKey = 'getInfoFromURL:' . sha1($url . $ocr);
 
                $data = DI::cache()->get($cacheKey);
 
                if (empty($data) || !is_array($data)) {
-                       $data = self::getInfoFromURL($url);
+                       $data = self::getInfoFromURL($url, $ocr);
 
                        DI::cache()->set($cacheKey, $data);
                }
@@ -209,10 +212,11 @@ class Images
         * Gets info from URL uncached
         *
         * @param string $url
+        * @param bool   $ocr
         * @return array Info array
         * @throws \Friendica\Network\HTTPException\InternalServerErrorException
         */
-       public static function getInfoFromURL(string $url): array
+       public static function getInfoFromURL(string $url, bool $ocr = false): array
        {
                $data = [];
 
@@ -257,6 +261,17 @@ class Images
 
                if ($image->isValid()) {
                        $data['blurhash'] = $image->getBlurHash();
+                       
+                       if ($ocr && DI::config()->get('system', 'tesseract_ocr')) {
+                               $ocr = new TesseractOCR();
+                               try {
+                                       $ocr->tempDir(System::getTempPath());
+                                       $ocr->imageData($img_str, strlen($img_str));
+                                       $data['description'] = $ocr->run();
+                               } catch (\Throwable $th) {
+                                       Logger::info('Error calling TesseractOCR', ['message' => $th->getMessage()]);
+                               }                       
+                       }
                }
 
                $data['size'] = $filesize;
index 819b0ad85fb45d357d1fd9c2978820d1a0155f41..b3a7f49984af88d1ce63bebef590ad702b2a7132 100644 (file)
@@ -441,6 +441,10 @@ return [
                // Don't show smilies.
                'no_smilies' => false,
 
+               // tesseract_ocr (Boolean)
+               // Use Tesseract OCR to extract text from images
+               'tesseract_ocr' => false,
+
                // optimize_all_tables (Boolean)
                // Optimizes all tables instead of only tables like workerqueue or the cache
                'optimize_all_tables' => false,