"npm-asset/moment": "^2.24",
"npm-asset/perfect-scrollbar": "0.6.16",
"npm-asset/textcomplete": "^0.18.2",
- "npm-asset/typeahead.js": "^0.11.1"
+ "npm-asset/typeahead.js": "^0.11.1",
+ "thiagoalessio/tesseract_ocr": "^2.13"
},
"suggest": {
"ext-imagick": "For faster image processing",
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically"
],
- "content-hash": "082b16e2c88895f1a03d5b0ffe678ba7",
+ "content-hash": "131ca83d1c6f64092ff5220e4a14a101",
"packages": [
{
"name": "asika/simple-console",
"html",
"markdown"
],
+ "funding": [
+ {
+ "url": "https://www.colinodell.com/sponsor",
+ "type": "custom"
+ },
+ {
+ "url": "https://www.paypal.me/colinpodell/10.00",
+ "type": "custom"
+ },
+ {
+ "url": "https://github.com/colinodell",
+ "type": "github"
+ },
+ {
+ "url": "https://www.patreon.com/colinodell",
+ "type": "patreon"
+ }
+ ],
"time": "2020-07-01T00:34:03+00:00"
},
{
],
"time": "2023-01-26T09:26:14+00:00"
},
+ {
+ "name": "thiagoalessio/tesseract_ocr",
+ "version": "2.13.0",
+ "source": {
+ "type": "git",
+ "url": "https://github.com/thiagoalessio/tesseract-ocr-for-php.git",
+ "reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1"
+ },
+ "dist": {
+ "type": "zip",
+ "url": "https://api.github.com/repos/thiagoalessio/tesseract-ocr-for-php/zipball/232a8cb9d571992f9bd1e263f2f6909cf6c173a1",
+ "reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1",
+ "shasum": ""
+ },
+ "require": {
+ "php": "^5.3 || ^7.0 || ^8.0"
+ },
+ "require-dev": {
+ "phpunit/php-code-coverage": "^2.2.4 || ^9.0.0"
+ },
+ "type": "library",
+ "autoload": {
+ "psr-4": {
+ "thiagoalessio\\TesseractOCR\\": "src/"
+ }
+ },
+ "notification-url": "https://packagist.org/downloads/",
+ "license": [
+ "MIT"
+ ],
+ "authors": [
+ {
+ "name": "thiagoalessio",
+ "email": "thiagoalessio@me.com"
+ }
+ ],
+ "description": "A wrapper to work with Tesseract OCR inside PHP.",
+ "keywords": [
+ "OCR",
+ "Tesseract",
+ "text recognition"
+ ],
+ "time": "2023-10-05T21:14:48+00:00"
+ },
{
"name": "ua-parser/uap-php",
"version": "v3.9.14",
$filetype = !empty($media['mimetype']) ? strtolower(current(explode('/', $media['mimetype']))) : '';
if (($media['type'] == self::IMAGE) || ($filetype == 'image')) {
- $imagedata = Images::getInfoFromURLCached($media['url']);
+ $imagedata = Images::getInfoFromURLCached($media['url'], empty($media['description']));
if ($imagedata) {
$media['mimetype'] = $imagedata['mime'];
$media['size'] = $imagedata['size'];
$media['width'] = $imagedata[0];
$media['height'] = $imagedata[1];
$media['blurhash'] = $imagedata['blurhash'] ?? null;
+ if (!empty($imagedata['description']) && empty($media['description'])) {
+ $media['description'] = $imagedata['description'];
+ Logger::debug('Detected text for image', $media);
+ }
} else {
Logger::notice('No image data', ['media' => $media]);
}
namespace Friendica\Util;
use Friendica\Core\Logger;
+use Friendica\Core\System;
use Friendica\DI;
use Friendica\Model\Photo;
use Friendica\Network\HTTPClient\Client\HttpClientAccept;
use Friendica\Object\Image;
+use thiagoalessio\TesseractOCR\TesseractOCR;
/**
* Image utilities
* Gets info array from given URL, cached data has priority
*
* @param string $url
+ * @param bool $ocr Forwarded to getInfoFromURL() to request OCR text detection; also part of the cache key
* @return array Info
* @throws \Friendica\Network\HTTPException\InternalServerErrorException
*/
- public static function getInfoFromURLCached(string $url): array
+ public static function getInfoFromURLCached(string $url, bool $ocr = false): array
{
$data = [];
return $data;
}
- $cacheKey = 'getInfoFromURL:' . sha1($url);
+ $cacheKey = 'getInfoFromURL:' . sha1($url . $ocr);
$data = DI::cache()->get($cacheKey);
if (empty($data) || !is_array($data)) {
- $data = self::getInfoFromURL($url);
+ $data = self::getInfoFromURL($url, $ocr);
DI::cache()->set($cacheKey, $data);
}
* Gets info from URL uncached
*
* @param string $url
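+ * @param bool $ocr When true and the "system.tesseract_ocr" config value is set, run Tesseract OCR on a valid image and store the result in the "description" key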
* @return array Info array
* @throws \Friendica\Network\HTTPException\InternalServerErrorException
*/
- public static function getInfoFromURL(string $url): array
+ public static function getInfoFromURL(string $url, bool $ocr = false): array
{
$data = [];
if ($image->isValid()) {
$data['blurhash'] = $image->getBlurHash();
+
+ if ($ocr && DI::config()->get('system', 'tesseract_ocr')) {
+ $ocr = new TesseractOCR();
+ try {
+ $ocr->tempDir(System::getTempPath());
+ $ocr->imageData($img_str, strlen($img_str));
+ $data['description'] = $ocr->run();
+ } catch (\Throwable $th) {
+ Logger::info('Error calling TesseractOCR', ['message' => $th->getMessage()]);
+ }
+ }
}
$data['size'] = $filesize;
// Don't show smilies.
'no_smilies' => false,
+ // tesseract_ocr (Boolean)
+ // Use Tesseract OCR to extract text from images
+ 'tesseract_ocr' => false,
+
// optimize_all_tables (Boolean)
// Optimizes all tables instead of only tables like workerqueue or the cache
'optimize_all_tables' => false,