From 3a60229e5c0925bea134e89c91d2a75a0246f622 Mon Sep 17 00:00:00 2001 From: Michael Date: Sat, 13 Jan 2024 19:30:20 +0000 Subject: [PATCH] Use OCR to fetch text in images --- composer.json | 3 +- composer.lock | 64 +++++++++++++++++++++++++++++++++++++- src/Model/Post/Media.php | 6 +++- src/Util/Images.php | 23 +++++++++++--- static/defaults.config.php | 4 +++ 5 files changed, 93 insertions(+), 7 deletions(-) diff --git a/composer.json b/composer.json index 21603c7b27..903a3fab06 100644 --- a/composer.json +++ b/composer.json @@ -75,7 +75,8 @@ "npm-asset/moment": "^2.24", "npm-asset/perfect-scrollbar": "0.6.16", "npm-asset/textcomplete": "^0.18.2", - "npm-asset/typeahead.js": "^0.11.1" + "npm-asset/typeahead.js": "^0.11.1", + "thiagoalessio/tesseract_ocr": "^2.13" }, "suggest": { "ext-imagick": "For faster image processing", diff --git a/composer.lock b/composer.lock index a541913811..9442d41823 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "082b16e2c88895f1a03d5b0ffe678ba7", + "content-hash": "131ca83d1c6f64092ff5220e4a14a101", "packages": [ { "name": "asika/simple-console", @@ -1317,6 +1317,24 @@ "html", "markdown" ], + "funding": [ + { + "url": "https://www.colinodell.com/sponsor", + "type": "custom" + }, + { + "url": "https://www.paypal.me/colinpodell/10.00", + "type": "custom" + }, + { + "url": "https://github.com/colinodell", + "type": "github" + }, + { + "url": "https://www.patreon.com/colinodell", + "type": "patreon" + } + ], "time": "2020-07-01T00:34:03+00:00" }, { @@ -4224,6 +4242,50 @@ ], "time": "2023-01-26T09:26:14+00:00" }, + { + "name": "thiagoalessio/tesseract_ocr", + "version": "2.13.0", + "source": { + "type": "git", + "url": "https://github.com/thiagoalessio/tesseract-ocr-for-php.git", + "reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1" + }, + "dist": { + "type": "zip", 
+ "url": "https://api.github.com/repos/thiagoalessio/tesseract-ocr-for-php/zipball/232a8cb9d571992f9bd1e263f2f6909cf6c173a1", + "reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1", + "shasum": "" + }, + "require": { + "php": "^5.3 || ^7.0 || ^8.0" + }, + "require-dev": { + "phpunit/php-code-coverage": "^2.2.4 || ^9.0.0" + }, + "type": "library", + "autoload": { + "psr-4": { + "thiagoalessio\\TesseractOCR\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "thiagoalessio", + "email": "thiagoalessio@me.com" + } + ], + "description": "A wrapper to work with Tesseract OCR inside PHP.", + "keywords": [ + "OCR", + "Tesseract", + "text recognition" + ], + "time": "2023-10-05T21:14:48+00:00" + }, { "name": "ua-parser/uap-php", "version": "v3.9.14", diff --git a/src/Model/Post/Media.php b/src/Model/Post/Media.php index df05db98d5..afd6ca8383 100644 --- a/src/Model/Post/Media.php +++ b/src/Model/Post/Media.php @@ -208,13 +208,17 @@ class Media $filetype = !empty($media['mimetype']) ? strtolower(current(explode('/', $media['mimetype']))) : ''; if (($media['type'] == self::IMAGE) || ($filetype == 'image')) { - $imagedata = Images::getInfoFromURLCached($media['url']); + $imagedata = Images::getInfoFromURLCached($media['url'], empty($media['description'])); if ($imagedata) { $media['mimetype'] = $imagedata['mime']; $media['size'] = $imagedata['size']; $media['width'] = $imagedata[0]; $media['height'] = $imagedata[1]; $media['blurhash'] = $imagedata['blurhash'] ?? 
null; + if (!empty($imagedata['description']) && empty($media['description'])) { + $media['description'] = $imagedata['description']; + Logger::debug('Detected text for image', $media); + } } else { Logger::notice('No image data', ['media' => $media]); } diff --git a/src/Util/Images.php b/src/Util/Images.php index b44b1fb8f5..0d64601f1d 100644 --- a/src/Util/Images.php +++ b/src/Util/Images.php @@ -22,10 +22,12 @@ namespace Friendica\Util; use Friendica\Core\Logger; +use Friendica\Core\System; use Friendica\DI; use Friendica\Model\Photo; use Friendica\Network\HTTPClient\Client\HttpClientAccept; use Friendica\Object\Image; +use thiagoalessio\TesseractOCR\TesseractOCR; /** * Image utilities @@ -181,10 +183,11 @@ class Images * Gets info array from given URL, cached data has priority * * @param string $url + * @param bool $ocr * @return array Info * @throws \Friendica\Network\HTTPException\InternalServerErrorException */ - public static function getInfoFromURLCached(string $url): array + public static function getInfoFromURLCached(string $url, bool $ocr = false): array { $data = []; @@ -192,12 +195,12 @@ class Images return $data; } - $cacheKey = 'getInfoFromURL:' . sha1($url); + $cacheKey = 'getInfoFromURL:' . sha1($url . 
$ocr); $data = DI::cache()->get($cacheKey); if (empty($data) || !is_array($data)) { - $data = self::getInfoFromURL($url); + $data = self::getInfoFromURL($url, $ocr); DI::cache()->set($cacheKey, $data); } @@ -209,10 +212,11 @@ class Images * Gets info from URL uncached * * @param string $url + * @param bool $ocr * @return array Info array * @throws \Friendica\Network\HTTPException\InternalServerErrorException */ - public static function getInfoFromURL(string $url): array + public static function getInfoFromURL(string $url, bool $ocr = false): array { $data = []; @@ -257,6 +261,17 @@ class Images if ($image->isValid()) { $data['blurhash'] = $image->getBlurHash(); + + if ($ocr && DI::config()->get('system', 'tesseract_ocr')) { + $ocr = new TesseractOCR(); + try { + $ocr->tempDir(System::getTempPath()); + $ocr->imageData($img_str, strlen($img_str)); + $data['description'] = $ocr->run(); + } catch (\Throwable $th) { + Logger::info('Error calling TesseractOCR', ['message' => $th->getMessage()]); + } + } } $data['size'] = $filesize; diff --git a/static/defaults.config.php b/static/defaults.config.php index 819b0ad85f..b3a7f49984 100644 --- a/static/defaults.config.php +++ b/static/defaults.config.php @@ -441,6 +441,10 @@ return [ // Don't show smilies. 'no_smilies' => false, + // tesseract_ocr (Boolean) + // Use Tesseract OCR to extract text from images + 'tesseract_ocr' => false, + // optimize_all_tables (Boolean) // Optimizes all tables instead of only tables like workerqueue or the cache 'optimize_all_tables' => false, -- 2.39.5