--- /dev/null
+{
+ "require": {
+ "thiagoalessio/tesseract_ocr": "^2.13"
+ }
+}
--- /dev/null
+{
+ "_readme": [
+ "This file locks the dependencies of your project to a known state",
+ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
+ "This file is @generated automatically"
+ ],
+ "content-hash": "778b5479cb5d2b31b57f40473a87f8eb",
+ "packages": [
+ {
+ "name": "thiagoalessio/tesseract_ocr",
+ "version": "2.13.0",
+ "source": {
+ "type": "git",
+ "url": "https://github.com/thiagoalessio/tesseract-ocr-for-php.git",
+ "reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1"
+ },
+ "dist": {
+ "type": "zip",
+ "url": "https://api.github.com/repos/thiagoalessio/tesseract-ocr-for-php/zipball/232a8cb9d571992f9bd1e263f2f6909cf6c173a1",
+ "reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1",
+ "shasum": ""
+ },
+ "require": {
+ "php": "^5.3 || ^7.0 || ^8.0"
+ },
+ "require-dev": {
+ "phpunit/php-code-coverage": "^2.2.4 || ^9.0.0"
+ },
+ "type": "library",
+ "autoload": {
+ "psr-4": {
+ "thiagoalessio\\TesseractOCR\\": "src/"
+ }
+ },
+ "notification-url": "https://packagist.org/downloads/",
+ "license": [
+ "MIT"
+ ],
+ "authors": [
+ {
+ "name": "thiagoalessio",
+ "email": "thiagoalessio@me.com"
+ }
+ ],
+ "description": "A wrapper to work with Tesseract OCR inside PHP.",
+ "keywords": [
+ "OCR",
+ "Tesseract",
+ "text recognition"
+ ],
+ "time": "2023-10-05T21:14:48+00:00"
+ }
+ ],
+ "packages-dev": [],
+ "aliases": [],
+ "minimum-stability": "stable",
+ "stability-flags": [],
+ "prefer-stable": false,
+ "prefer-lowest": false,
+ "platform": [],
+ "platform-dev": [],
+ "platform-overrides": {
+ "php": "7.2"
+ },
+ "plugin-api-version": "1.1.0"
+}
--- /dev/null
+<?php
+/**
+ * Name: Tesseract OCR
+ * Description: Use OCR to get text from images
+ * Version: 0.1
+ * Author: Michael Vogel <http://pirati.ca/profile/heluecht>
+ */
+
+use Friendica\Core\Hook;
+use Friendica\Core\Logger;
+use Friendica\Core\System;
+use thiagoalessio\TesseractOCR\TesseractOCR;
+
+require_once __DIR__ . DIRECTORY_SEPARATOR . 'vendor' . DIRECTORY_SEPARATOR . 'autoload.php';
+
+/**
+ * Registers tesseract_ocr_detection() from this file as the handler of the
+ * 'ocr-detection' hook and writes a notice to the log.
+ */
+function tesseract_install()
+{
+ Hook::register('ocr-detection', __FILE__, 'tesseract_ocr_detection');
+
+ Logger::notice('installed tesseract');
+}
+
+/**
+ * Handler for the 'ocr-detection' hook: runs Tesseract on in-memory image
+ * data and stores the recognised text in the media record.
+ *
+ * Any error raised while calling the OCR library is logged and otherwise
+ * ignored, so $media is left unchanged on failure.
+ *
+ * @param array $media Media record, passed by reference. 'img_str' is read
+ *                     as the image data; 'description' is written on success.
+ */
+function tesseract_ocr_detection(&$media)
+{
+	// Without image data there is nothing to recognise: bail out instead of
+	// triggering an undefined-index notice and a useless OCR run.
+	if (empty($media['img_str']) || !is_string($media['img_str'])) {
+		return;
+	}
+
+	try {
+		$ocr = new TesseractOCR();
+		$ocr->tempDir(System::getTempPath());
+		// imageData() takes the data together with its size.
+		$ocr->imageData($media['img_str'], strlen($media['img_str']));
+		$media['description'] = $ocr->run();
+	} catch (\Throwable $th) {
+		Logger::info('Error calling TesseractOCR', ['message' => $th->getMessage()]);
+	}
+}
--- /dev/null
+<?php
+
+// autoload.php @generated by Composer
+
+// Load the generated bootstrap class, then return the loader produced by its getLoader() method
+// to whoever included this file.
+require_once __DIR__ . '/composer/autoload_real.php';
+
+return ComposerAutoloaderInit695d781792f754383aa61632167d066e::getLoader();
--- /dev/null
+<?php
+
+/*
+ * This file is part of Composer.
+ *
+ * (c) Nils Adermann <naderman@naderman.de>
+ * Jordi Boggiano <j.boggiano@seld.be>
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+namespace Composer\Autoload;
+
+/**
+ * ClassLoader implements a PSR-0, PSR-4 and classmap class loader.
+ *
+ * $loader = new \Composer\Autoload\ClassLoader();
+ *
+ * // register classes with namespaces
+ * $loader->add('Symfony\Component', __DIR__.'/component');
+ * $loader->add('Symfony', __DIR__.'/framework');
+ *
+ * // activate the autoloader
+ * $loader->register();
+ *
+ * // to enable searching the include path (eg. for PEAR packages)
+ * $loader->setUseIncludePath(true);
+ *
+ * In this example, if you try to use a class in the Symfony\Component
+ * namespace or one of its children (Symfony\Component\Console for instance),
+ * the autoloader will first look for the class under the component/
+ * directory, and it will then fallback to the framework/ directory if not
+ * found before giving up.
+ *
+ * This class is loosely based on the Symfony UniversalClassLoader.
+ *
+ * @author Fabien Potencier <fabien@symfony.com>
+ * @author Jordi Boggiano <j.boggiano@seld.be>
+ * @see http://www.php-fig.org/psr/psr-0/
+ * @see http://www.php-fig.org/psr/psr-4/
+ */
+class ClassLoader
+{
+ // PSR-4
+ // Map of first character => (namespace prefix => prefix length).
+ private $prefixLengthsPsr4 = array();
+ // Map of namespace prefix => list of base directories.
+ private $prefixDirsPsr4 = array();
+ // Directories tried for any class after the PSR-4 prefixes yielded no file.
+ private $fallbackDirsPsr4 = array();
+
+ // PSR-0
+ // Map of first character => (prefix => list of directories).
+ private $prefixesPsr0 = array();
+ // Directories tried for any class after the PSR-0 prefixes yielded no file.
+ private $fallbackDirsPsr0 = array();
+
+ // Whether the PHP include path is consulted as the last lookup step.
+ private $useIncludePath = false;
+ // Map of class name => file path; consulted before any prefix lookup.
+ private $classMap = array();
+ // When true, findFile() returns false for every class missing from $classMap.
+ private $classMapAuthoritative = false;
+ // Class names (stored as keys) that an earlier findFile() call could not locate.
+ private $missingClasses = array();
+ // Key prefix for APCu cache entries; null means APCu is not used.
+ private $apcuPrefix;
+
+ /**
+ * Returns the registered PSR-0 prefixes, merged across all first-character
+ * groups, or an empty array when none are registered.
+ *
+ * @return array
+ */
+ public function getPrefixes()
+ {
+ if (!empty($this->prefixesPsr0)) {
+ return call_user_func_array('array_merge', array_values($this->prefixesPsr0));
+ }
+
+ return array();
+ }
+
+ /**
+ * Returns the PSR-4 map of namespace prefix => base directories.
+ *
+ * @return array
+ */
+ public function getPrefixesPsr4()
+ {
+ return $this->prefixDirsPsr4;
+ }
+
+ /**
+ * Returns the PSR-0 fallback directories.
+ *
+ * @return array
+ */
+ public function getFallbackDirs()
+ {
+ return $this->fallbackDirsPsr0;
+ }
+
+ /**
+ * Returns the PSR-4 fallback directories.
+ *
+ * @return array
+ */
+ public function getFallbackDirsPsr4()
+ {
+ return $this->fallbackDirsPsr4;
+ }
+
+ /**
+ * Returns the class name => file path map.
+ *
+ * @return array
+ */
+ public function getClassMap()
+ {
+ return $this->classMap;
+ }
+
+ /**
+ * Adds entries to the class map; on duplicate class names the newly
+ * passed entry replaces the existing one (array_merge semantics).
+ *
+ * @param array $classMap Class to filename map
+ */
+ public function addClassMap(array $classMap)
+ {
+ if ($this->classMap) {
+ $this->classMap = array_merge($this->classMap, $classMap);
+ } else {
+ $this->classMap = $classMap;
+ }
+ }
+
+ /**
+ * Registers a set of PSR-0 directories for a given prefix, either
+ * appending or prepending to the ones previously set for this prefix.
+ *
+ * An empty prefix adds the paths to the PSR-0 fallback directories instead.
+ *
+ * @param string $prefix The prefix
+ * @param array|string $paths The PSR-0 root directories
+ * @param bool $prepend Whether to prepend the directories
+ */
+ public function add($prefix, $paths, $prepend = false)
+ {
+ if (!$prefix) {
+ if ($prepend) {
+ $this->fallbackDirsPsr0 = array_merge(
+ (array) $paths,
+ $this->fallbackDirsPsr0
+ );
+ } else {
+ $this->fallbackDirsPsr0 = array_merge(
+ $this->fallbackDirsPsr0,
+ (array) $paths
+ );
+ }
+
+ return;
+ }
+
+ // Prefixes are grouped by their first character.
+ $first = $prefix[0];
+ if (!isset($this->prefixesPsr0[$first][$prefix])) {
+ $this->prefixesPsr0[$first][$prefix] = (array) $paths;
+
+ return;
+ }
+ if ($prepend) {
+ $this->prefixesPsr0[$first][$prefix] = array_merge(
+ (array) $paths,
+ $this->prefixesPsr0[$first][$prefix]
+ );
+ } else {
+ $this->prefixesPsr0[$first][$prefix] = array_merge(
+ $this->prefixesPsr0[$first][$prefix],
+ (array) $paths
+ );
+ }
+ }
+
+ /**
+ * Registers a set of PSR-4 directories for a given namespace, either
+ * appending or prepending to the ones previously set for this namespace.
+ *
+ * @param string $prefix The prefix/namespace, with trailing '\\'
+ * @param array|string $paths The PSR-4 base directories
+ * @param bool $prepend Whether to prepend the directories
+ *
+ * @throws \InvalidArgumentException
+ */
+ public function addPsr4($prefix, $paths, $prepend = false)
+ {
+ if (!$prefix) {
+ // Register directories for the root namespace.
+ if ($prepend) {
+ $this->fallbackDirsPsr4 = array_merge(
+ (array) $paths,
+ $this->fallbackDirsPsr4
+ );
+ } else {
+ $this->fallbackDirsPsr4 = array_merge(
+ $this->fallbackDirsPsr4,
+ (array) $paths
+ );
+ }
+ } elseif (!isset($this->prefixDirsPsr4[$prefix])) {
+ // Register directories for a new namespace.
+ $length = strlen($prefix);
+ if ('\\' !== $prefix[$length - 1]) {
+ throw new \InvalidArgumentException("A non-empty PSR-4 prefix must end with a namespace separator.");
+ }
+ $this->prefixLengthsPsr4[$prefix[0]][$prefix] = $length;
+ $this->prefixDirsPsr4[$prefix] = (array) $paths;
+ } elseif ($prepend) {
+ // Prepend directories for an already registered namespace.
+ $this->prefixDirsPsr4[$prefix] = array_merge(
+ (array) $paths,
+ $this->prefixDirsPsr4[$prefix]
+ );
+ } else {
+ // Append directories for an already registered namespace.
+ $this->prefixDirsPsr4[$prefix] = array_merge(
+ $this->prefixDirsPsr4[$prefix],
+ (array) $paths
+ );
+ }
+ }
+
+ /**
+ * Registers a set of PSR-0 directories for a given prefix,
+ * replacing any others previously set for this prefix.
+ *
+ * An empty prefix replaces the PSR-0 fallback directories instead.
+ *
+ * @param string $prefix The prefix
+ * @param array|string $paths The PSR-0 base directories
+ */
+ public function set($prefix, $paths)
+ {
+ if (!$prefix) {
+ $this->fallbackDirsPsr0 = (array) $paths;
+ } else {
+ $this->prefixesPsr0[$prefix[0]][$prefix] = (array) $paths;
+ }
+ }
+
+ /**
+ * Registers a set of PSR-4 directories for a given namespace,
+ * replacing any others previously set for this namespace.
+ *
+ * An empty prefix replaces the PSR-4 fallback directories instead.
+ *
+ * @param string $prefix The prefix/namespace, with trailing '\\'
+ * @param array|string $paths The PSR-4 base directories
+ *
+ * @throws \InvalidArgumentException
+ */
+ public function setPsr4($prefix, $paths)
+ {
+ if (!$prefix) {
+ $this->fallbackDirsPsr4 = (array) $paths;
+ } else {
+ $length = strlen($prefix);
+ if ('\\' !== $prefix[$length - 1]) {
+ throw new \InvalidArgumentException("A non-empty PSR-4 prefix must end with a namespace separator.");
+ }
+ $this->prefixLengthsPsr4[$prefix[0]][$prefix] = $length;
+ $this->prefixDirsPsr4[$prefix] = (array) $paths;
+ }
+ }
+
+ /**
+ * Turns on searching the include path for class files.
+ *
+ * @param bool $useIncludePath
+ */
+ public function setUseIncludePath($useIncludePath)
+ {
+ $this->useIncludePath = $useIncludePath;
+ }
+
+ /**
+ * Can be used to check if the autoloader uses the include path to check
+ * for classes.
+ *
+ * @return bool
+ */
+ public function getUseIncludePath()
+ {
+ return $this->useIncludePath;
+ }
+
+ /**
+ * Turns off searching the prefix and fallback directories for classes
+ * that have not been registered with the class map.
+ *
+ * @param bool $classMapAuthoritative
+ */
+ public function setClassMapAuthoritative($classMapAuthoritative)
+ {
+ $this->classMapAuthoritative = $classMapAuthoritative;
+ }
+
+ /**
+ * Should class lookup fail if not found in the current class map?
+ *
+ * @return bool
+ */
+ public function isClassMapAuthoritative()
+ {
+ return $this->classMapAuthoritative;
+ }
+
+ /**
+ * APCu prefix to use to cache found/not-found classes, if the extension is enabled.
+ *
+ * The prefix is stored only when apcu_fetch() exists and the 'apc.enabled'
+ * ini setting evaluates to true; otherwise null is stored.
+ *
+ * @param string|null $apcuPrefix
+ */
+ public function setApcuPrefix($apcuPrefix)
+ {
+ $this->apcuPrefix = function_exists('apcu_fetch') && filter_var(ini_get('apc.enabled'), FILTER_VALIDATE_BOOLEAN) ? $apcuPrefix : null;
+ }
+
+ /**
+ * The APCu prefix in use, or null if APCu caching is not enabled.
+ *
+ * @return string|null
+ */
+ public function getApcuPrefix()
+ {
+ return $this->apcuPrefix;
+ }
+
+ /**
+ * Registers this instance as an autoloader.
+ *
+ * @param bool $prepend Whether to prepend the autoloader or not
+ */
+ public function register($prepend = false)
+ {
+ spl_autoload_register(array($this, 'loadClass'), true, $prepend);
+ }
+
+ /**
+ * Unregisters this instance as an autoloader.
+ */
+ public function unregister()
+ {
+ spl_autoload_unregister(array($this, 'loadClass'));
+ }
+
+ /**
+ * Loads the given class or interface.
+ *
+ * @param string $class The name of the class
+ * @return bool|null True if loaded, null otherwise
+ */
+ public function loadClass($class)
+ {
+ if ($file = $this->findFile($class)) {
+ includeFile($file);
+
+ return true;
+ }
+ }
+
+ /**
+ * Finds the path to the file where the class is defined.
+ *
+ * Lookup order: class map, remembered misses, APCu cache (when a prefix
+ * is set), then the PSR-4/PSR-0 search for '.php' and, on HHVM, '.hh'.
+ *
+ * @param string $class The name of the class
+ *
+ * @return string|false The path if found, false otherwise
+ */
+ public function findFile($class)
+ {
+ // class map lookup
+ if (isset($this->classMap[$class])) {
+ return $this->classMap[$class];
+ }
+ if ($this->classMapAuthoritative || isset($this->missingClasses[$class])) {
+ return false;
+ }
+ if (null !== $this->apcuPrefix) {
+ $file = apcu_fetch($this->apcuPrefix.$class, $hit);
+ if ($hit) {
+ return $file;
+ }
+ }
+
+ $file = $this->findFileWithExtension($class, '.php');
+
+ // Search for Hack files if we are running on HHVM
+ if (false === $file && defined('HHVM_VERSION')) {
+ $file = $this->findFileWithExtension($class, '.hh');
+ }
+
+ // Store the outcome in APCu; $file may be false here, so misses are cached too.
+ if (null !== $this->apcuPrefix) {
+ apcu_add($this->apcuPrefix.$class, $file);
+ }
+
+ if (false === $file) {
+ // Remember that this class does not exist.
+ $this->missingClasses[$class] = true;
+ }
+
+ return $file;
+ }
+
+ /**
+ * Searches the PSR-4 prefixes, PSR-4 fallback directories, PSR-0 prefixes,
+ * PSR-0 fallback directories and finally the include path (if enabled)
+ * for a file defining the class, returning the first one that exists.
+ *
+ * @param string $class The name of the class
+ * @param string $ext File extension to append, including the leading dot
+ *
+ * @return string|false The path if found, false otherwise
+ */
+ private function findFileWithExtension($class, $ext)
+ {
+ // PSR-4 lookup
+ $logicalPathPsr4 = strtr($class, '\\', DIRECTORY_SEPARATOR) . $ext;
+
+ $first = $class[0];
+ if (isset($this->prefixLengthsPsr4[$first])) {
+ // Drop one trailing namespace segment per iteration, so longer prefixes are tried first.
+ $subPath = $class;
+ while (false !== $lastPos = strrpos($subPath, '\\')) {
+ $subPath = substr($subPath, 0, $lastPos);
+ $search = $subPath . '\\';
+ if (isset($this->prefixDirsPsr4[$search])) {
+ // Remainder of the logical path after the matched prefix.
+ $pathEnd = DIRECTORY_SEPARATOR . substr($logicalPathPsr4, $lastPos + 1);
+ foreach ($this->prefixDirsPsr4[$search] as $dir) {
+ if (file_exists($file = $dir . $pathEnd)) {
+ return $file;
+ }
+ }
+ }
+ }
+ }
+
+ // PSR-4 fallback dirs
+ foreach ($this->fallbackDirsPsr4 as $dir) {
+ if (file_exists($file = $dir . DIRECTORY_SEPARATOR . $logicalPathPsr4)) {
+ return $file;
+ }
+ }
+
+ // PSR-0 lookup
+ if (false !== $pos = strrpos($class, '\\')) {
+ // namespaced class name
+ // Underscores become directory separators only in the part after the last namespace separator.
+ $logicalPathPsr0 = substr($logicalPathPsr4, 0, $pos + 1)
+ . strtr(substr($logicalPathPsr4, $pos + 1), '_', DIRECTORY_SEPARATOR);
+ } else {
+ // PEAR-like class name
+ $logicalPathPsr0 = strtr($class, '_', DIRECTORY_SEPARATOR) . $ext;
+ }
+
+ if (isset($this->prefixesPsr0[$first])) {
+ foreach ($this->prefixesPsr0[$first] as $prefix => $dirs) {
+ if (0 === strpos($class, $prefix)) {
+ foreach ($dirs as $dir) {
+ if (file_exists($file = $dir . DIRECTORY_SEPARATOR . $logicalPathPsr0)) {
+ return $file;
+ }
+ }
+ }
+ }
+ }
+
+ // PSR-0 fallback dirs
+ foreach ($this->fallbackDirsPsr0 as $dir) {
+ if (file_exists($file = $dir . DIRECTORY_SEPARATOR . $logicalPathPsr0)) {
+ return $file;
+ }
+ }
+
+ // PSR-0 include paths.
+ if ($this->useIncludePath && $file = stream_resolve_include_path($logicalPathPsr0)) {
+ return $file;
+ }
+
+ return false;
+ }
+}
+
+/**
+ * Scope isolated include.
+ *
+ * Prevents access to $this/self from included files.
+ *
+ * @param string $file Path of the file to include
+ */
+function includeFile($file)
+{
+ include $file;
+}
--- /dev/null
+
+Copyright (c) Nils Adermann, Jordi Boggiano
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is furnished
+to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
--- /dev/null
+<?php
+
+// autoload_classmap.php @generated by Composer
+
+// Two levels up from this file; one more level up gives $baseDir.
+$vendorDir = dirname(dirname(__FILE__));
+$baseDir = dirname($vendorDir);
+
+// Class map returned to the including code; it has no entries.
+return array(
+);
--- /dev/null
+<?php
+
+// autoload_namespaces.php @generated by Composer
+
+// Two levels up from this file; one more level up gives $baseDir.
+$vendorDir = dirname(dirname(__FILE__));
+$baseDir = dirname($vendorDir);
+
+// Namespace map returned to the including code; it has no entries.
+return array(
+);
--- /dev/null
+<?php
+
+// autoload_psr4.php @generated by Composer
+
+// Two levels up from this file; one more level up gives $baseDir.
+$vendorDir = dirname(dirname(__FILE__));
+$baseDir = dirname($vendorDir);
+
+// PSR-4 map returned to the including code: namespace prefix => list of source directories.
+return array(
+ 'thiagoalessio\\TesseractOCR\\' => array($vendorDir . '/thiagoalessio/tesseract_ocr/src'),
+);
--- /dev/null
+<?php
+
+// autoload_real.php @generated by Composer
+
+class ComposerAutoloaderInit695d781792f754383aa61632167d066e
+{
+ // Loader created by the first getLoader() call and returned by every later call.
+ private static $loader;
+
+ /**
+ * Autoload callback that handles exactly one class name,
+ * 'Composer\Autoload\ClassLoader', by requiring ClassLoader.php.
+ *
+ * @param string $class Name of the class being autoloaded
+ */
+ public static function loadClassLoader($class)
+ {
+ if ('Composer\Autoload\ClassLoader' === $class) {
+ require __DIR__ . '/ClassLoader.php';
+ }
+ }
+
+ /**
+ * Creates, configures and registers the class loader on the first call;
+ * later calls return the same instance.
+ *
+ * @return \Composer\Autoload\ClassLoader
+ */
+ public static function getLoader()
+ {
+ if (null !== self::$loader) {
+ return self::$loader;
+ }
+
+ // loadClassLoader() is registered only while the ClassLoader instance is created.
+ spl_autoload_register(array('ComposerAutoloaderInit695d781792f754383aa61632167d066e', 'loadClassLoader'), true, true);
+ self::$loader = $loader = new \Composer\Autoload\ClassLoader();
+ spl_autoload_unregister(array('ComposerAutoloaderInit695d781792f754383aa61632167d066e', 'loadClassLoader'));
+
+ // The static initializer is used on PHP >= 5.6, outside HHVM, and only if zend_loader_file_encoded() is absent or returns false.
+ $useStaticLoader = PHP_VERSION_ID >= 50600 && !defined('HHVM_VERSION') && (!function_exists('zend_loader_file_encoded') || !zend_loader_file_encoded());
+ if ($useStaticLoader) {
+ require_once __DIR__ . '/autoload_static.php';
+
+ call_user_func(\Composer\Autoload\ComposerStaticInit695d781792f754383aa61632167d066e::getInitializer($loader));
+ } else {
+ // Otherwise feed the loader from the three generated map files.
+ $map = require __DIR__ . '/autoload_namespaces.php';
+ foreach ($map as $namespace => $path) {
+ $loader->set($namespace, $path);
+ }
+
+ $map = require __DIR__ . '/autoload_psr4.php';
+ foreach ($map as $namespace => $path) {
+ $loader->setPsr4($namespace, $path);
+ }
+
+ $classMap = require __DIR__ . '/autoload_classmap.php';
+ if ($classMap) {
+ $loader->addClassMap($classMap);
+ }
+ }
+
+ // true = prepend this loader to the autoload queue.
+ $loader->register(true);
+
+ return $loader;
+ }
+}
--- /dev/null
+<?php
+
+// autoload_static.php @generated by Composer
+
+namespace Composer\Autoload;
+
+class ComposerStaticInit695d781792f754383aa61632167d066e
+{
+ // First character => (PSR-4 namespace prefix => length of that prefix in characters).
+ public static $prefixLengthsPsr4 = array (
+ 't' =>
+ array (
+ 'thiagoalessio\\TesseractOCR\\' => 27,
+ ),
+ );
+
+ // PSR-4 namespace prefix => list of source directories.
+ public static $prefixDirsPsr4 = array (
+ 'thiagoalessio\\TesseractOCR\\' =>
+ array (
+ 0 => __DIR__ . '/..' . '/thiagoalessio/tesseract_ocr/src',
+ ),
+ );
+
+ /**
+ * Builds a closure that copies the static PSR-4 tables above into the
+ * given loader. The closure is bound to the ClassLoader class scope so
+ * that it can assign the loader's private properties.
+ *
+ * @param ClassLoader $loader Loader to initialise
+ *
+ * @return \Closure
+ */
+ public static function getInitializer(ClassLoader $loader)
+ {
+ return \Closure::bind(function () use ($loader) {
+ $loader->prefixLengthsPsr4 = ComposerStaticInit695d781792f754383aa61632167d066e::$prefixLengthsPsr4;
+ $loader->prefixDirsPsr4 = ComposerStaticInit695d781792f754383aa61632167d066e::$prefixDirsPsr4;
+
+ }, null, ClassLoader::class);
+ }
+}
--- /dev/null
+[
+ {
+ "name": "thiagoalessio/tesseract_ocr",
+ "version": "2.13.0",
+ "version_normalized": "2.13.0.0",
+ "source": {
+ "type": "git",
+ "url": "https://github.com/thiagoalessio/tesseract-ocr-for-php.git",
+ "reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1"
+ },
+ "dist": {
+ "type": "zip",
+ "url": "https://api.github.com/repos/thiagoalessio/tesseract-ocr-for-php/zipball/232a8cb9d571992f9bd1e263f2f6909cf6c173a1",
+ "reference": "232a8cb9d571992f9bd1e263f2f6909cf6c173a1",
+ "shasum": ""
+ },
+ "require": {
+ "php": "^5.3 || ^7.0 || ^8.0"
+ },
+ "require-dev": {
+ "phpunit/php-code-coverage": "^2.2.4 || ^9.0.0"
+ },
+ "time": "2023-10-05T21:14:48+00:00",
+ "type": "library",
+ "installation-source": "dist",
+ "autoload": {
+ "psr-4": {
+ "thiagoalessio\\TesseractOCR\\": "src/"
+ }
+ },
+ "notification-url": "https://packagist.org/downloads/",
+ "license": [
+ "MIT"
+ ],
+ "authors": [
+ {
+ "name": "thiagoalessio",
+ "email": "thiagoalessio@me.com"
+ }
+ ],
+ "description": "A wrapper to work with Tesseract OCR inside PHP.",
+ "keywords": [
+ "OCR",
+ "Tesseract",
+ "text recognition"
+ ]
+ }
+]
--- /dev/null
+---
+build: false
+
+install:
+ - ps: Set-Service wuauserv -StartupType Manual
+ - choco install php
+ - choco install capture2text --version 3.9
+ - choco install composer
+ - refreshenv
+ - cd %APPVEYOR_BUILD_FOLDER%
+ - composer install
+
+test_script:
+ - php tests\run.php unit e2e
--- /dev/null
+Copyright (c) 2012-2021 Thiago Alessio Pereira
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- /dev/null
+<img src="https://thiagoalessio.github.io/tesseract-ocr-for-php/images/logo.png" alt="Tesseract OCR for PHP" align="right" width="320px"/>
+
+# Tesseract OCR for PHP
+
+A wrapper to work with Tesseract OCR inside PHP.
+
+[![CI][ci_badge]][ci]
+[![AppVeyor][appveyor_badge]][appveyor]
+[![Codacy][codacy_badge]][codacy]
+[![Test Coverage][test_coverage_badge]][test_coverage]
+<br/>
+[![Latest Stable Version][stable_version_badge]][packagist]
+[![Total Downloads][total_downloads_badge]][packagist]
+[![Monthly Downloads][monthly_downloads_badge]][packagist]
+
+## Installation
+
+Via [Composer][]:
+
+ $ composer require thiagoalessio/tesseract_ocr
+
+:bangbang: **This library depends on [Tesseract OCR][], version _3.02_ or later.**
+
+<br/>
+
+### ![][windows_icon] Note for Windows users
+
+There are [many ways][tesseract_installation_on_windows] to install
+[Tesseract OCR][] on your system, but if you just want something quick to
+get up and running, I recommend installing the [Capture2Text][] package with
+[Chocolatey][].
+
+ choco install capture2text --version 3.9
+
+:warning: Recent versions of [Capture2Text][] stopped shipping the `tesseract` binary.
+
+<br/>
+
+### ![][macos_icon] Note for macOS users
+
+With [MacPorts][] you can install support for individual languages, like so:
+
+ $ sudo port install tesseract-<langcode>
+
+But that is not possible with [Homebrew][]. It comes only with **English** support
+by default, so if you intend to use it for other languages, the quickest solution
+is to install them all:
+
+ $ brew install tesseract tesseract-lang
+
+<br/>
+
+## Usage
+
+### Basic usage
+
+<img align="right" width="50%" title="The quick brown fox jumps over the lazy dog." src="./tests/EndToEnd/images/text.png"/>
+
+```php
+use thiagoalessio\TesseractOCR\TesseractOCR;
+echo (new TesseractOCR('text.png'))
+ ->run();
+```
+
+```
+The quick brown fox
+jumps over
+the lazy dog.
+```
+
+<br/>
+
+### Other languages
+
+<img align="right" width="50%" title="Bülowstraße" src="./tests/EndToEnd/images/german.png"/>
+
+```php
+use thiagoalessio\TesseractOCR\TesseractOCR;
+echo (new TesseractOCR('german.png'))
+ ->lang('deu')
+ ->run();
+```
+
+```
+Bülowstraße
+```
+
+<br/>
+
+### Multiple languages
+
+<img align="right" width="50%" title="I eat すし y Pollo" src="./tests/EndToEnd/images/mixed-languages.png"/>
+
+```php
+use thiagoalessio\TesseractOCR\TesseractOCR;
+echo (new TesseractOCR('mixed-languages.png'))
+ ->lang('eng', 'jpn', 'spa')
+ ->run();
+```
+
+```
+I eat すし y Pollo
+```
+
+<br/>
+
+### Inducing recognition
+
+<img align="right" width="50%" title="8055" src="./tests/EndToEnd/images/8055.png"/>
+
+```php
+use thiagoalessio\TesseractOCR\TesseractOCR;
+echo (new TesseractOCR('8055.png'))
+ ->allowlist(range('A', 'Z'))
+ ->run();
+```
+
+```
+BOSS
+```
+
+<br/>
+
+### Breaking CAPTCHAs
+
+Yes, I know some of you might want to use this library for the *noble* purpose
+of breaking CAPTCHAs, so please take a look at this comment:
+
+<https://github.com/thiagoalessio/tesseract-ocr-for-php/issues/91#issuecomment-342290510>
+
+## API
+
+### run
+
+Executes a `tesseract` command, optionally receiving an integer as `timeout`,
+in case you experience stalled tesseract processes.
+
+```php
+$ocr = new TesseractOCR();
+$ocr->run();
+```
+```php
+$ocr = new TesseractOCR();
+$timeout = 500;
+$ocr->run($timeout);
+```
+
+### image
+
+Define the path of an image to be recognized by `tesseract`.
+
+```php
+$ocr = new TesseractOCR();
+$ocr->image('/path/to/image.png');
+$ocr->run();
+```
+
+### imageData
+
+Set the image to be recognized by `tesseract` from a string, with its size.
+This can be useful when dealing with files that are already loaded in memory.
+You can easily retrieve the image data and size of an image object:
+```php
+//Using Imagick
+$data = $img->getImageBlob();
+$size = $img->getImageLength();
+//Using GD
+ob_start();
+// Note that you can use any format supported by tesseract
+imagepng($img, null, 0);
+$size = ob_get_length();
+$data = ob_get_clean();
+
+$ocr = new TesseractOCR();
+$ocr->imageData($data, $size);
+$ocr->run();
+```
+
+### executable
+
+Define a custom location of the `tesseract` executable,
+if for any reason it is not present in the `$PATH`.
+
+```php
+echo (new TesseractOCR('img.png'))
+ ->executable('/path/to/tesseract')
+ ->run();
+```
+
+### version
+
+Returns the current version of `tesseract`.
+
+```php
+echo (new TesseractOCR())->version();
+```
+
+### availableLanguages
+
+Returns a list of available languages/scripts.
+
+```php
+foreach((new TesseractOCR())->availableLanguages() as $lang) echo $lang;
+```
+
+__More info:__ <https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc#languages-and-scripts>
+
+### tessdataDir
+
+Specify a custom location for the tessdata directory.
+
+```php
+echo (new TesseractOCR('img.png'))
+ ->tessdataDir('/path')
+ ->run();
+```
+
+### userWords
+
+Specify the location of user words file.
+
+This is a plain text file containing a list of words that you want to be
+considered as normal dictionary words by `tesseract`.
+
+Useful when dealing with contents that contain technical terminology, jargon,
+etc.
+
+```
+$ cat /path/to/user-words.txt
+foo
+bar
+```
+
+```php
+echo (new TesseractOCR('img.png'))
+ ->userWords('/path/to/user-words.txt')
+ ->run();
+```
+
+### userPatterns
+
+Specify the location of user patterns file.
+
+If the contents you are dealing with have known patterns, this option can
+greatly improve tesseract's recognition accuracy.
+
+```
+$ cat /path/to/user-patterns.txt
+1-\d\d\d-GOOG-441
+www.\n\\\*.com
+```
+
+```php
+echo (new TesseractOCR('img.png'))
+ ->userPatterns('/path/to/user-patterns.txt')
+ ->run();
+```
+
+### lang
+
+Define one or more languages to be used during the recognition.
+A complete list of available languages can be found at:
+<https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc#languages>
+
+__Tip from [@daijiale][]:__ Use the combination `->lang('chi_sim', 'chi_tra')`
+for proper recognition of Chinese.
+
+```php
+ echo (new TesseractOCR('img.png'))
+ ->lang('lang1', 'lang2', 'lang3')
+ ->run();
+```
+
+### psm
+
+Specify the Page Segmentation Method, which instructs `tesseract` how to
+interpret the given image.
+
+__More info:__ <https://github.com/tesseract-ocr/tesseract/wiki/ImproveQuality#page-segmentation-method>
+
+```php
+echo (new TesseractOCR('img.png'))
+ ->psm(6)
+ ->run();
+```
+
+### oem
+
+Specify the OCR Engine Mode. (see `tesseract --help-oem`)
+
+```php
+echo (new TesseractOCR('img.png'))
+ ->oem(2)
+ ->run();
+```
+
+### dpi
+
+Specify the image DPI. It is useful if your image does not contain this information in its metadata.
+
+```php
+echo (new TesseractOCR('img.png'))
+ ->dpi(300)
+ ->run();
+```
+
+### allowlist
+
+This is a shortcut for `->config('tessedit_char_whitelist', 'abcdef....')`.
+
+```php
+echo (new TesseractOCR('img.png'))
+ ->allowlist(range('a', 'z'), range(0, 9), '-_@')
+ ->run();
+```
+
+### configFile
+
+Specify a config file to be used. It can either be the path to your own
+config file or the name of one of the predefined config files:
+<https://github.com/tesseract-ocr/tesseract/tree/master/tessdata/configs>
+
+```php
+echo (new TesseractOCR('img.png'))
+ ->configFile('hocr')
+ ->run();
+```
+
+### setOutputFile
+
+Specify an output file to be used. Be aware: if you set an output file, then
+the option `withoutTempFiles` is ignored.
+Temp files are written (and deleted) even if `withoutTempFiles = true`.
+
+In combination with `configFile` you are able to get the `hocr`, `tsv` or
+`pdf` files.
+
+```php
+echo (new TesseractOCR('img.png'))
+ ->configFile('pdf')
+ ->setOutputFile('/PATH_TO_MY_OUTPUTFILE/searchable.pdf')
+ ->run();
+```
+
+### digits
+
+Shortcut for `->configFile('digits')`.
+
+```php
+echo (new TesseractOCR('img.png'))
+ ->digits()
+ ->run();
+```
+
+### hocr
+
+Shortcut for `->configFile('hocr')`.
+
+```php
+echo (new TesseractOCR('img.png'))
+ ->hocr()
+ ->run();
+```
+
+### pdf
+
+Shortcut for `->configFile('pdf')`.
+
+```php
+echo (new TesseractOCR('img.png'))
+ ->pdf()
+ ->run();
+```
+
+### quiet
+
+Shortcut for `->configFile('quiet')`.
+
+```php
+echo (new TesseractOCR('img.png'))
+ ->quiet()
+ ->run();
+```
+
+### tsv
+
+Shortcut for `->configFile('tsv')`.
+
+```php
+echo (new TesseractOCR('img.png'))
+ ->tsv()
+ ->run();
+```
+
+### txt
+
+Shortcut for `->configFile('txt')`.
+
+```php
+echo (new TesseractOCR('img.png'))
+ ->txt()
+ ->run();
+```
+
+### tempDir
+
+Define a custom directory to store temporary files generated by tesseract.
+Make sure the directory actually exists and the user running `php` is allowed
+to write in there.
+
+```php
+echo (new TesseractOCR('img.png'))
+ ->tempDir('./my/custom/temp/dir')
+ ->run();
+```
+
+### withoutTempFiles
+
+Specify that `tesseract` should output the recognized text without writing to temporary files.
+The data is gathered from the standard output of `tesseract` instead.
+
+```php
+echo (new TesseractOCR('img.png'))
+ ->withoutTempFiles()
+ ->run();
+```
+
+### Other options
+
+Any configuration option offered by Tesseract can be used like this:
+
+```php
+echo (new TesseractOCR('img.png'))
+ ->config('config_var', 'value')
+ ->config('other_config_var', 'other value')
+ ->run();
+```
+
+Or like this:
+
+```php
+echo (new TesseractOCR('img.png'))
+ ->configVar('value')
+ ->otherConfigVar('other value')
+ ->run();
+```
+
+__More info:__ <https://github.com/tesseract-ocr/tesseract/wiki/ControlParams>
+
+### Thread-limit
+
+Sometimes, it may be useful to limit the number of threads that tesseract is
+allowed to use (e.g. in [this case](https://github.com/tesseract-ocr/tesseract/issues/898)).
+Set the maximum number of threads with `threadLimit` before calling `run`:
+
+```php
+echo (new TesseractOCR('img.png'))
+ ->threadLimit(1)
+ ->run();
+```
+
+## How to contribute
+
+You can contribute to this project by:
+
+* Opening an [Issue][] if you found a bug or wish to propose a new feature;
+* Placing a [Pull Request][] with code that fixes a bug, corrects missing/wrong
+ documentation, or implements a new feature.
+
+Just make sure you take a look at our [Code of Conduct][] and [Contributing][]
+instructions.
+
+## License
+
+tesseract-ocr-for-php is released under the [MIT License][].
+
+
+<h2></h2><p align="center"><sub>Made with <sub><a href="#"><img src="https://thiagoalessio.github.io/tesseract-ocr-for-php/images/heart.svg" alt="love" width="14px"/></a></sub> in Berlin</sub></p>
+
+[ci_badge]: https://github.com/thiagoalessio/tesseract-ocr-for-php/workflows/CI/badge.svg?event=push&branch=main
+[ci]: https://github.com/thiagoalessio/tesseract-ocr-for-php/actions?query=workflow%3ACI
+[appveyor_badge]: https://ci.appveyor.com/api/projects/status/xwy5ls0798iwcim3/branch/main?svg=true
+[appveyor]: https://ci.appveyor.com/project/thiagoalessio/tesseract-ocr-for-php/branch/main
+[codacy_badge]: https://app.codacy.com/project/badge/Grade/a81aa10012874f23a57df5b492d835f2
+[codacy]: https://www.codacy.com/gh/thiagoalessio/tesseract-ocr-for-php/dashboard
+[test_coverage_badge]: https://codecov.io/gh/thiagoalessio/tesseract-ocr-for-php/branch/main/graph/badge.svg?token=Y0VnrqiSIf
+[test_coverage]: https://codecov.io/gh/thiagoalessio/tesseract-ocr-for-php
+[stable_version_badge]: https://img.shields.io/packagist/v/thiagoalessio/tesseract_ocr.svg
+[packagist]: https://packagist.org/packages/thiagoalessio/tesseract_ocr
+[total_downloads_badge]: https://img.shields.io/packagist/dt/thiagoalessio/tesseract_ocr.svg
+[monthly_downloads_badge]: https://img.shields.io/packagist/dm/thiagoalessio/tesseract_ocr.svg
+[Tesseract OCR]: https://github.com/tesseract-ocr/tesseract
+[Composer]: http://getcomposer.org/
+[windows_icon]: https://thiagoalessio.github.io/tesseract-ocr-for-php/images/windows-18.svg
+[macos_icon]: https://thiagoalessio.github.io/tesseract-ocr-for-php/images/apple-18.svg
+[tesseract_installation_on_windows]: https://github.com/tesseract-ocr/tesseract/wiki#windows
+[Capture2Text]: https://chocolatey.org/packages/capture2text
+[Chocolatey]: https://chocolatey.org
+[MacPorts]: https://www.macports.org
+[Homebrew]: https://brew.sh
+[@daijiale]: https://github.com/daijiale
+[HOCR]: https://github.com/tesseract-ocr/tesseract/wiki/Command-Line-Usage#hocr-output
+[TSV]: https://github.com/tesseract-ocr/tesseract/wiki/Command-Line-Usage#tsv-output-currently-available-in-305-dev-in-master-branch-on-github
+[Issue]: https://github.com/thiagoalessio/tesseract-ocr-for-php/issues
+[Pull Request]: https://github.com/thiagoalessio/tesseract-ocr-for-php/pulls
+[Code of Conduct]: https://github.com/thiagoalessio/tesseract-ocr-for-php/blob/main/.github/CODE_OF_CONDUCT.md
+[Contributing]: https://github.com/thiagoalessio/tesseract-ocr-for-php/blob/main/.github/CONTRIBUTING.md
+[MIT License]: https://github.com/thiagoalessio/tesseract-ocr-for-php/blob/main/MIT-LICENSE
--- /dev/null
+fixes:
+- "/home/runner/work/tesseract-ocr-for-php/tesseract-ocr-for-php/::"
+- "/Users/runner/work/tesseract-ocr-for-php/tesseract-ocr-for-php/::"
+- "C:\\projects\\tesseract-ocr-for-php\\::"
--- /dev/null
+{
+ "name": "thiagoalessio/tesseract_ocr",
+ "description": "A wrapper to work with Tesseract OCR inside PHP.",
+ "version": "2.13.0",
+ "type": "library",
+ "keywords": ["Tesseract", "OCR", "text recognition"],
+ "license": "MIT",
+ "authors": [
+ {
+ "name": "thiagoalessio",
+ "email": "thiagoalessio@me.com"
+ }
+ ],
+ "support": {
+ "issues": "https://github.com/thiagoalessio/tesseract-ocr-for-php/issues",
+ "irc": "irc://irc.freenode.net/tesseract-ocr-for-php",
+ "source": "https://github.com/thiagoalessio/tesseract-ocr-for-php"
+ },
+ "require": {
+ "php": "^5.3 || ^7.0 || ^8.0"
+ },
+ "require-dev": {
+ "phpunit/php-code-coverage": "^2.2.4 || ^9.0.0"
+ },
+ "autoload": {
+ "psr-4": {
+ "thiagoalessio\\TesseractOCR\\": "src/"
+ }
+ },
+ "autoload-dev": {
+ "psr-4": {
+ "thiagoalessio\\TesseractOCR\\Tests\\": "tests/"
+ }
+ }
+}
--- /dev/null
+<?php namespace thiagoalessio\TesseractOCR;
+
+/**
+ * Assembles the tesseract command line for one OCR invocation.
+ *
+ * The public properties are the knobs that shape the command; casting the
+ * object to a string (or calling build()) yields the complete command.
+ */
+class Command
+{
+    public $executable = 'tesseract';
+    public $useFileAsInput = true;
+    public $useFileAsOutput = true;
+    public $options = array();
+    public $configFile;
+    public $tempDir;
+    public $threadLimit;
+    public $image;
+    public $imageSize;
+    private $outputFile;
+
+    /**
+     * @param mixed $image      Stored in $image; emitted as the input argument
+     *                          when $useFileAsInput is true.
+     * @param mixed $outputFile Output file base name (without extension);
+     *                          generated on first use when omitted.
+     */
+    public function __construct($image=null, $outputFile=null)
+    {
+        $this->image = $image;
+        $this->outputFile = $outputFile;
+    }
+
+    /**
+     * @return string The command line, identical to casting this object to string.
+     */
+    public function build() { return "$this"; }
+
+    /**
+     * Renders the command line.
+     *
+     * Order of the parts: optional OMP_THREAD_LIMIT assignment, escaped
+     * executable, input (escaped image or "-"), output (escaped output base
+     * name or "-"), every option, and finally the config file name if set.
+     *
+     * @return string
+     */
+    public function __toString()
+    {
+        $cmd = array();
+        if ($this->threadLimit) $cmd[] = "OMP_THREAD_LIMIT={$this->threadLimit}";
+        $cmd[] = self::escape($this->executable);
+        $cmd[] = $this->useFileAsInput ? self::escape($this->image) : "-";
+        $cmd[] = $this->useFileAsOutput ? self::escape($this->getOutputFile(false)) : "-";
+
+        // Callable options are invoked with the detected version so they can
+        // adapt their output to it; anything else is cast to string as-is.
+        $version = $this->getTesseractVersion();
+
+        foreach ($this->options as $option) {
+            $cmd[] = is_callable($option) ? $option($version) : "$option";
+        }
+        if ($this->configFile) $cmd[] = $this->configFile;
+
+        return join(' ', $cmd);
+    }
+
+    /**
+     * Returns the output file path, creating a temp-file based name on first
+     * use when none was given to the constructor.
+     *
+     * @param bool $withExt When true, appends ".hocr", ".tsv" or ".pdf" if the
+     *                      config file is one of those names, ".txt" otherwise.
+     * @return string
+     */
+    public function getOutputFile($withExt=true)
+    {
+        if (!$this->outputFile)
+            $this->outputFile = $this->getTempDir()
+                .DIRECTORY_SEPARATOR
+                .basename(tempnam($this->getTempDir(), 'ocr'));
+        if (!$withExt) return $this->outputFile;
+
+        $hasCustomExt = array('hocr', 'tsv', 'pdf');
+        // Strict comparison: only the exact strings select a custom extension.
+        $ext = in_array($this->configFile, $hasCustomExt, true) ? $this->configFile : 'txt';
+        return "{$this->outputFile}.{$ext}";
+    }
+
+    /**
+     * @return string The configured $tempDir, or the system temp dir when unset.
+     */
+    public function getTempDir()
+    {
+        return $this->tempDir ?: sys_get_temp_dir();
+    }
+
+    /**
+     * Runs "<executable> --version" and returns the second space-separated
+     * token of the first output line.
+     *
+     * @return string|null The token, or null when the command printed nothing
+     *                     or its first line has no second token.
+     */
+    public function getTesseractVersion()
+    {
+        exec(self::escape($this->executable).' --version 2>&1', $output);
+        // Guard both lookups so unexpected output yields null instead of
+        // undefined-offset notices.
+        if (empty($output)) return null;
+        $outputParts = explode(' ', $output[0]);
+        return isset($outputParts[1]) ? $outputParts[1] : null;
+    }
+
+    /**
+     * Runs "<executable> --list-langs" and returns its output lines, sorted,
+     * with the first line removed (presumably a header line -- confirm
+     * against the tesseract version in use).
+     *
+     * @return array
+     */
+    public function getAvailableLanguages()
+    {
+        exec(self::escape($this->executable) . ' --list-langs 2>&1', $output);
+        array_shift($output);
+        sort($output);
+        return $output;
+    }
+
+    /**
+     * Wraps a value in double quotes, backslash-escaping `$`, `"` and the
+     * backtick; the backslash itself is escaped too except when PHP_OS
+     * starts with "WIN".
+     *
+     * @param string $str
+     * @return string
+     */
+    public static function escape($str)
+    {
+        $charlist = strtoupper(substr(PHP_OS, 0, 3)) == 'WIN' ? '$"`' : '$"\\`';
+        return '"'.addcslashes($str, $charlist).'"';
+    }
+}
--- /dev/null
+<?php
+
+namespace thiagoalessio\TesseractOCR;
+
+/**
+ * Thrown by FriendlyErrors::checkTesseractVersion() when the detected
+ * tesseract version is lower than the version a feature requires.
+ */
+class FeatureNotAvailableException extends TesseractOcrException
+{
+}
--- /dev/null
+<?php namespace thiagoalessio\TesseractOCR;
+
+/**
+ * Static precondition checks that turn common failures into exceptions
+ * carrying a multi-line, human-readable message.
+ *
+ * Every method returns silently when its check passes and throws otherwise.
+ */
+class FriendlyErrors
+{
+    /**
+     * Ensures the image file exists.
+     *
+     * @param string $image Path to the image.
+     * @throws ImageNotFoundException When the path does not exist.
+     */
+    public static function checkImagePath($image)
+    {
+        if (file_exists($image)) return;
+
+        $currentDir = __DIR__;
+        $msg = array();
+        $msg[] = "Error! The image \"$image\" was not found.";
+        $msg[] = '';
+        $msg[] = "The current __DIR__ is $currentDir";
+        $msg = join(PHP_EOL, $msg);
+
+        throw new ImageNotFoundException($msg);
+    }
+
+    /**
+     * Ensures the executable exists, either as a file path or as a command
+     * the system can locate ("where.exe" when PHP_OS starts with "win",
+     * "type" otherwise).
+     *
+     * @param string $executable Path or command name.
+     * @throws TesseractNotFoundException When the lookup exits non-zero.
+     */
+    public static function checkTesseractPresence($executable)
+    {
+        if (file_exists($executable)) return;
+
+        $cmd = stripos(PHP_OS, 'win') === 0
+            ? 'where.exe '.Command::escape($executable).' > NUL 2>&1'
+            : 'type '.Command::escape($executable).' > /dev/null 2>&1';
+        system($cmd, $exitCode);
+
+        if ($exitCode == 0) return;
+
+        $currentPath = getenv('PATH');
+        $msg = array();
+        $msg[] = "Error! The command \"$executable\" was not found.";
+        $msg[] = '';
+        $msg[] = 'Make sure you have Tesseract OCR installed on your system:';
+        $msg[] = 'https://github.com/tesseract-ocr/tesseract';
+        $msg[] = '';
+        $msg[] = "The current \$PATH is $currentPath";
+        $msg = join(PHP_EOL, $msg);
+
+        throw new TesseractNotFoundException($msg);
+    }
+
+    /**
+     * Ensures the command produced output: a non-empty output file when
+     * writing to a file, a non-empty $stdout otherwise.
+     *
+     * @param Command $command The executed command.
+     * @param string  $stdout  Captured standard output.
+     * @param string  $stderr  Captured standard error, appended to the message.
+     * @throws UnsuccessfulCommandException When no output was produced.
+     */
+    public static function checkCommandExecution($command, $stdout, $stderr)
+    {
+        if ($command->useFileAsOutput) {
+            $file = $command->getOutputFile();
+            if (file_exists($file) && filesize($file) > 0) return;
+        }
+
+        if (!$command->useFileAsOutput && $stdout) {
+            return;
+        }
+
+        $msg = array();
+        $msg[] = 'Error! The command did not produce any output.';
+        $msg[] = '';
+        $msg[] = 'Generated command:';
+        $msg[] = "$command";
+        $msg[] = '';
+        $msg[] = 'Returned message:';
+        $arrayStderr = explode(PHP_EOL, $stderr);
+        // Drop only the empty element left by a trailing newline; when stderr
+        // does not end with a newline the last element is real content.
+        if (end($arrayStderr) === '') array_pop($arrayStderr);
+        $msg = array_merge($msg, $arrayStderr);
+        $msg = join(PHP_EOL, $msg);
+
+        throw new UnsuccessfulCommandException($msg);
+    }
+
+    /**
+     * Ensures the process handle is valid.
+     *
+     * @param mixed  $processHandle Handle to validate; FALSE means failure.
+     * @param string $command       Command line, included in the message.
+     * @throws UnsuccessfulCommandException When the handle is FALSE.
+     */
+    public static function checkProcessCreation($processHandle, $command)
+    {
+        if ($processHandle !== FALSE) return;
+
+        $msg = array();
+        $msg[] = 'Error! The command could not be launched.';
+        $msg[] = '';
+        $msg[] = 'Generated command:';
+        $msg[] = "$command";
+        $msg = join(PHP_EOL, $msg);
+
+        throw new UnsuccessfulCommandException($msg);
+    }
+
+    /**
+     * Ensures the detected tesseract version is at least $expected.
+     *
+     * @param string  $expected Minimum version.
+     * @param string  $action   Feature description used in the message.
+     * @param Command $command  Supplies the version and is printed in the message.
+     * @throws FeatureNotAvailableException When the version is too low.
+     */
+    public static function checkTesseractVersion($expected, $action, $command)
+    {
+        // Cast before stripping the optional "v" prefix so a missing version
+        // compares as too old instead of triggering offset warnings.
+        $actual = preg_replace('/^v/', '', (string) $command->getTesseractVersion());
+
+        if (version_compare($actual, $expected, ">=")) return;
+
+        $msg = array();
+        $msg[] = "Error! $action is not available this tesseract version";
+        $msg[] = "Required version is $expected, actual version is $actual";
+        $msg[] = '';
+        $msg[] = 'Generated command:';
+        $msg[] = "$command";
+        $msg = join(PHP_EOL, $msg);
+
+        throw new FeatureNotAvailableException($msg);
+    }
+
+    /**
+     * Ensures $path can be written: its directory is created if missing and
+     * must be writable, and an already existing file must be writable too.
+     *
+     * @param string $path Target output file.
+     * @throws NoWritePermissionsForOutputFile When directory or file is not writable.
+     */
+    public static function checkWritePermissions($path)
+    {
+        $dir = dirname($path);
+        // Recursive so that nested, not-yet-existing directories are created.
+        if (!is_dir($dir)) mkdir($dir, 0777, true);
+        $writableDirectory = is_writable($dir);
+        $writableFile = true;
+        if (file_exists($path)) $writableFile = is_writable($path);
+        if ($writableFile && $writableDirectory) return;
+
+        $msg = array();
+        $msg[] = "Error! No permission to write to $path";
+        $msg[] = "Make sure you have the right outputFile and permissions "
+            ."to write to the folder";
+        $msg[] = '';
+        $msg = join(PHP_EOL, $msg);
+
+        throw new NoWritePermissionsForOutputFile($msg);
+    }
+}
--- /dev/null
+<?php
+
+namespace thiagoalessio\TesseractOCR;
+
+/**
+ * Thrown by FriendlyErrors::checkImagePath() when the given image path does
+ * not exist.
+ */
+class ImageNotFoundException extends TesseractOcrException
+{
+}
--- /dev/null
+<?php
+
+namespace thiagoalessio\TesseractOCR;
+
+/**
+ * Thrown by FriendlyErrors::checkWritePermissions() when the output file or
+ * its directory is not writable.
+ */
+class NoWritePermissionsForOutputFile extends TesseractOcrException
+{
+}
--- /dev/null
+<?php namespace thiagoalessio\TesseractOCR;
+
+/**
+ * Factory for command-line options.
+ *
+ * Every factory returns a closure producing the option text; closures that
+ * declare a $version parameter use it to adapt to, or validate against, the
+ * tesseract version passed in.
+ */
+class Option
+{
+    /**
+     * Page segmentation mode: "--psm" from version 4 on, "-psm" before.
+     *
+     * @param mixed $psm
+     * @return \Closure function($version): string
+     */
+    public static function psm($psm)
+    {
+        return function($version) use ($psm) {
+            $version = preg_replace('/^v/', '', $version);
+            return (version_compare($version, 4, '>=') ? '-' : '')."-psm $psm";
+        };
+    }
+
+    /**
+     * OCR engine mode ("--oem"); requires version 3.05 or later.
+     *
+     * @param mixed $oem
+     * @return \Closure function($version): string
+     */
+    public static function oem($oem)
+    {
+        return function($version) use ($oem) {
+            Option::checkMinVersion('3.05', $version, 'oem');
+            return "--oem $oem";
+        };
+    }
+
+    /**
+     * Image resolution ("--dpi").
+     *
+     * @param mixed $dpi
+     * @return \Closure function(): string
+     */
+    public static function dpi($dpi)
+    {
+        return function() use ($dpi) {
+            return "--dpi $dpi";
+        };
+    }
+
+    /**
+     * User words file ("--user-words"); requires version 3.04 or later.
+     *
+     * @param string $path Quoted with backslashes and double quotes escaped.
+     * @return \Closure function($version): string
+     */
+    public static function userWords($path)
+    {
+        return function($version) use ($path) {
+            Option::checkMinVersion('3.04', $version, 'user-words');
+            return '--user-words "'.addcslashes($path, '\\"').'"';
+        };
+    }
+
+    /**
+     * User patterns file ("--user-patterns"); requires version 3.04 or later.
+     *
+     * @param string $path Quoted with backslashes and double quotes escaped.
+     * @return \Closure function($version): string
+     */
+    public static function userPatterns($path)
+    {
+        return function($version) use ($path) {
+            Option::checkMinVersion('3.04', $version, 'user-patterns');
+            return '--user-patterns "'.addcslashes($path, '\\"').'"';
+        };
+    }
+
+    /**
+     * Tessdata directory ("--tessdata-dir").
+     *
+     * @param string $path Quoted with backslashes and double quotes escaped.
+     * @return \Closure function(): string
+     */
+    public static function tessdataDir($path)
+    {
+        return function() use ($path) {
+            return '--tessdata-dir "'.addcslashes($path, '\\"').'"';
+        };
+    }
+
+    /**
+     * Languages ("-l"); all arguments are joined with "+".
+     *
+     * @return \Closure function(): string
+     */
+    public static function lang()
+    {
+        $languages = func_get_args();
+        return function() use ($languages) {
+            return '-l '.join('+', $languages);
+        };
+    }
+
+    /**
+     * Config variable ("-c name=value"). The name is converted from
+     * camelCase to snake_case.
+     *
+     * @param string $var   Variable name.
+     * @param mixed  $value Variable value.
+     * @return \Closure function(): string
+     */
+    public static function config($var, $value)
+    {
+        return function() use($var, $value) {
+            $snakeCase = function($str) {
+                // The quantifier sits inside the group so a run of capitals
+                // is kept whole ("DAWG" -> "_dawg") instead of being reduced
+                // to its last letter.
+                return strtolower(preg_replace('/([A-Z]+)/', '_$1', $str));
+            };
+            $pair = $snakeCase($var).'='.$value;
+            return '-c "'.addcslashes($pair, '\\"').'"';
+        };
+    }
+
+    /**
+     * Ensures the current version is not lower than the minimum one; a
+     * leading "v" is ignored on both.
+     *
+     * @param string $minVersion  Minimum version supporting the option.
+     * @param string $currVersion Version to validate.
+     * @param string $option      Option name used in the message.
+     * @throws FeatureNotAvailableException When $currVersion is lower than
+     *         $minVersion. It is an \Exception subclass, so existing
+     *         catch (\Exception) handlers keep working.
+     */
+    public static function checkMinVersion($minVersion, $currVersion, $option)
+    {
+        $minVersion = preg_replace('/^v/', '', $minVersion);
+        $currVersion = preg_replace('/^v/', '', $currVersion);
+        if (!version_compare($currVersion, $minVersion, '<')) return;
+        $msg = "$option option is only available on Tesseract $minVersion or later.";
+        $msg.= PHP_EOL."Your version of Tesseract is $currVersion";
+        throw new FeatureNotAvailableException($msg);
+    }
+}
--- /dev/null
+<?php namespace thiagoalessio\TesseractOCR;
+
+/**
+ * Thin wrapper around a child process started with proc_open(), exposing
+ * its stdin for writing and collecting its stdout/stderr.
+ */
+class Process {
+
+    private $stdin;
+    private $stdout;
+    private $stderr;
+    private $handle;
+    private $startTime;
+
+    /**
+     * Starts the command with pipes attached to stdin, stdout and stderr.
+     *
+     * @param string $command Command line to execute.
+     * @throws UnsuccessfulCommandException Via
+     *         FriendlyErrors::checkProcessCreation() when the process could
+     *         not be created.
+     */
+    public function __construct($command)
+    {
+        $this->startTime = microtime(true);
+        $streamDescriptors = [
+            array("pipe", "r"),
+            array("pipe", "w"),
+            array("pipe", "w")
+        ];
+        $this->handle = proc_open($command, $streamDescriptors, $pipes, NULL, NULL, ["bypass_shell" => true]);
+
+        // Validate the handle before touching $pipes, so the pipes are only
+        // unpacked once the process is known to exist.
+        FriendlyErrors::checkProcessCreation($this->handle, $command);
+
+        list($this->stdin, $this->stdout, $this->stderr) = $pipes;
+
+        //This is can avoid deadlock on some cases (when stderr buffer is filled up before writing to stdout and vice-versa)
+        stream_set_blocking($this->stdout, 0);
+        stream_set_blocking($this->stderr, 0);
+    }
+
+    /**
+     * Writes $data to the process' stdin, retrying after partial writes.
+     *
+     * @param string $data Bytes to send.
+     * @param int    $len  Number of bytes expected to be written.
+     * @return bool True when exactly $len bytes were written.
+     */
+    public function write($data, $len)
+    {
+        $total = 0;
+        do
+        {
+            $res = fwrite($this->stdin, substr($data, $total));
+            // false (error) or 0 (nothing accepted): give up instead of spinning.
+            if (!$res) break;
+            // Accumulate the byte count first, then compare: the original
+            // "$total += $res < $len" added the boolean result of the
+            // comparison because "<" binds tighter than "+=".
+            $total += $res;
+        } while($total < $len);
+        return $total === $len;
+    }
+
+
+    /**
+     * Polls the process until it exits or the timeout elapses, collecting
+     * everything it writes to stdout and stderr.
+     *
+     * @param int|float $timeout Seconds counted from construction; 0 disables the timeout.
+     * @return array Array with the keys "out" and "err".
+     */
+    public function wait($timeout = 0)
+    {
+        $running = true;
+        $data = ["out" => "", "err" => ""];
+        while (($running === true) && !$this->hasTimedOut($timeout))
+        {
+            $data["out"] .= fread($this->stdout, 8192);
+            $data["err"] .= fread($this->stderr, 8192);
+            $procInfo = proc_get_status($this->handle);
+            $running = $procInfo["running"];
+            if ($running) {
+                usleep(1000); // Sleep 1ms to yield CPU time
+            }
+        }
+        // The process may have written more than one chunk, or written after
+        // the last fread() above, before exiting: drain what is still
+        // buffered so the output is not truncated. The streams are
+        // non-blocking, so this does not wait on a still-running process.
+        $data["out"] .= stream_get_contents($this->stdout);
+        $data["err"] .= stream_get_contents($this->stderr);
+        return $data;
+    }
+
+    /**
+     * Closes all pipes that are still open, then the process itself.
+     *
+     * @return int The value returned by proc_close().
+     */
+    public function close()
+    {
+        $this->closeStream($this->stdin);
+        $this->closeStream($this->stdout);
+        $this->closeStream($this->stderr);
+        return proc_close($this->handle);
+    }
+
+    /**
+     * Closes only stdin, signalling end of input to the process.
+     */
+    public function closeStdin()
+    {
+        $this->closeStream($this->stdin);
+    }
+
+    /**
+     * @param int|float $timeout Seconds; values <= 0 never time out.
+     * @return bool True when more than $timeout seconds passed since construction.
+     */
+    private function hasTimedOut($timeout)
+    {
+        return (($timeout > 0) && ($this->startTime + $timeout < microtime(true)));
+    }
+
+    /**
+     * Closes the stream and nulls the reference so it is not closed twice.
+     *
+     * @param resource|null $stream Passed by reference.
+     */
+    private function closeStream(&$stream)
+    {
+        if ($stream !== NULL)
+        {
+            fclose($stream);
+            $stream = NULL;
+        }
+    }
+}
--- /dev/null
+<?php
+
+namespace thiagoalessio\TesseractOCR;
+
+/**
+ * Thrown by FriendlyErrors::checkTesseractPresence() when the tesseract
+ * executable is neither an existing file nor a command the system can find.
+ */
+class TesseractNotFoundException extends TesseractOcrException
+{
+}
--- /dev/null
+<?php namespace thiagoalessio\TesseractOCR;
+
+use thiagoalessio\TesseractOCR\Command;
+use thiagoalessio\TesseractOCR\Option;
+use thiagoalessio\TesseractOCR\FriendlyErrors;
+
+/**
+ * Fluent entry point of the library: configure the underlying Command via
+ * chained calls, then call run() to execute tesseract and get the text.
+ *
+ * Unknown methods are handled by __call(): config-file names select the
+ * config file, Option factory names add that option, and anything else is
+ * added as a "-c name=value" config variable.
+ */
+class TesseractOCR
+{
+    public $command;
+    private $outputFile = null;
+
+    /**
+     * @param mixed        $image   Image path; cast to string and stored on the command.
+     * @param Command|null $command Command to configure; a new one is created when falsy.
+     */
+    public function __construct($image=null, $command=null)
+    {
+        if (!$command) {
+            $command = new Command;
+        }
+        $this->command = $command;
+        $this->image("$image");
+    }
+
+    /**
+     * Executes tesseract and returns the recognized text, trimmed.
+     *
+     * @param int|float $timeout Passed to Process::wait().
+     * @return string
+     * @throws TesseractOcrException Any failed precondition or execution
+     *         check; temporary files are removed before rethrowing.
+     */
+    public function run($timeout = 0)
+    {
+        $cmd = $this->command;
+        $hasTarget = $this->outputFile !== null;
+
+        try {
+            // An explicit output file forces file-based output.
+            if ($hasTarget) {
+                FriendlyErrors::checkWritePermissions($this->outputFile);
+                $cmd->useFileAsOutput = true;
+            }
+
+            FriendlyErrors::checkTesseractPresence($cmd->executable);
+            if ($cmd->useFileAsInput) {
+                FriendlyErrors::checkImagePath($cmd->image);
+            }
+
+            $process = new Process((string) $cmd);
+
+            // Without an input file the image bytes are fed through stdin.
+            if (!$cmd->useFileAsInput) {
+                $process->write($cmd->image, $cmd->imageSize);
+                $process->closeStdin();
+            }
+            $result = $process->wait($timeout);
+
+            FriendlyErrors::checkCommandExecution($cmd, $result["out"], $result["err"]);
+        }
+        catch (TesseractOcrException $failure) {
+            if ($cmd->useFileAsOutput) $this->cleanTempFiles();
+            throw $failure;
+        }
+
+        if (!$cmd->useFileAsOutput) {
+            $text = $result["out"];
+        }
+        else {
+            $produced = $cmd->getOutputFile();
+            $text = file_get_contents($produced);
+
+            // Hand the result file over to the caller before cleaning up.
+            if ($hasTarget) {
+                rename($produced, $this->outputFile);
+            }
+
+            $this->cleanTempFiles();
+        }
+
+        return trim($text, " \t\n\r\0\x0A\x0B\x0C");
+    }
+
+    /**
+     * Feeds raw image data through stdin instead of reading a file.
+     *
+     * @param string $image Image bytes.
+     * @param int    $size  Number of bytes.
+     * @return $this
+     * @throws FeatureNotAvailableException Below tesseract 3.03-rc1.
+     */
+    public function imageData($image, $size)
+    {
+        FriendlyErrors::checkTesseractVersion("3.03-rc1", "Reading image data from stdin", $this->command);
+        $target = $this->command;
+        $target->useFileAsInput = false;
+        $target->image = $image;
+        $target->imageSize = $size;
+        return $this;
+    }
+
+    /**
+     * Reads the result from stdout instead of a temporary file.
+     *
+     * @return $this
+     * @throws FeatureNotAvailableException Below tesseract 3.03-rc1.
+     */
+    public function withoutTempFiles()
+    {
+        FriendlyErrors::checkTesseractVersion("3.03-rc1", "Writing to stdout (without using temp files)", $this->command);
+        $this->command->useFileAsOutput = false;
+        return $this;
+    }
+
+    /**
+     * @param string $image Image path.
+     * @return $this
+     */
+    public function image($image)
+    {
+        $this->command->image = $image;
+        return $this;
+    }
+
+    /**
+     * @param string $executable Path or name of the tesseract executable.
+     * @return $this
+     * @throws TesseractNotFoundException When it cannot be found.
+     */
+    public function executable($executable)
+    {
+        FriendlyErrors::checkTesseractPresence($executable);
+        $this->command->executable = $executable;
+        return $this;
+    }
+
+    /**
+     * @param string $configFile Config file name appended to the command.
+     * @return $this
+     */
+    public function configFile($configFile)
+    {
+        $this->command->configFile = $configFile;
+        return $this;
+    }
+
+    /**
+     * @param string $tempDir Directory for temporary output files.
+     * @return $this
+     */
+    public function tempDir($tempDir)
+    {
+        $this->command->tempDir = $tempDir;
+        return $this;
+    }
+
+    /**
+     * @param int $limit Value for the OMP_THREAD_LIMIT prefix of the command.
+     * @return $this
+     */
+    public function threadLimit($limit)
+    {
+        $this->command->threadLimit = $limit;
+        return $this;
+    }
+
+    // @deprecated
+    public function format($fmt)
+    {
+        return $this->configFile($fmt);
+    }
+
+    /**
+     * @param string $path Where run() moves the produced file to.
+     * @return $this
+     */
+    public function setOutputFile($path)
+    {
+        $this->outputFile = $path;
+        return $this;
+    }
+
+    /**
+     * Restricts recognition to the given characters. Accepts any number of
+     * strings and/or arrays of strings; all are concatenated.
+     *
+     * @return $this
+     */
+    public function allowlist()
+    {
+        $chars = self::concatCharGroups(func_get_args());
+        $this->command->options[] = Option::config('tessedit_char_whitelist', $chars);
+        return $this;
+    }
+
+    /**
+     * Deprecated alias of allowlist(); raises an E_USER_NOTICE.
+     *
+     * @return $this
+     */
+    public function whitelist()
+    {
+        $warningMsg = 'Notice: whitelist is deprecated, use allowlist instead.';
+        trigger_error($warningMsg, E_USER_NOTICE);
+
+        return $this->allowlist(self::concatCharGroups(func_get_args()));
+    }
+
+    /**
+     * @return string|null Version reported by the command.
+     */
+    public function version()
+    {
+        return $this->command->getTesseractVersion();
+    }
+
+    /**
+     * @return array Languages reported by the command.
+     */
+    public function availableLanguages()
+    {
+        return $this->command->getAvailableLanguages();
+    }
+
+    /**
+     * Dispatches undefined methods, in this order: config-file name,
+     * Option factory, generic config variable (first argument as value).
+     *
+     * @param string $method
+     * @param array  $args
+     * @return $this
+     */
+    public function __call($method, $args)
+    {
+        if ($this->isConfigFile($method)) {
+            return $this->configFile($method);
+        }
+
+        if ($this->isOption($method)) {
+            $factory = array($this->getOptionClassName(), $method);
+            $this->command->options[] = call_user_func_array($factory, $args);
+            return $this;
+        }
+
+        if (empty($args)) {
+            $value = null;
+        } else {
+            $value = $args[0];
+        }
+        $this->command->options[] = Option::config($method, $value);
+        return $this;
+    }
+
+    /** True when $name is one of the recognized config-file names. */
+    private function isConfigFile($name)
+    {
+        $known = array('digits', 'hocr', 'pdf', 'quiet', 'tsv', 'txt');
+        return in_array($name, $known);
+    }
+
+    /** True when $name is a method of the Option class. */
+    private function isOption($name)
+    {
+        $factories = get_class_methods($this->getOptionClassName());
+        return in_array($name, $factories);
+    }
+
+    /** Fully qualified name of the Option class. */
+    private function getOptionClassName()
+    {
+        return __NAMESPACE__.'\\Option';
+    }
+
+    /** Removes the temporary output file, both without and with extension. */
+    private function cleanTempFiles()
+    {
+        foreach (array(false, true) as $withExt) {
+            $file = $this->command->getOutputFile($withExt);
+            if (file_exists($file)) {
+                unlink($file);
+            }
+        }
+    }
+
+    /** Concatenates strings and arrays of strings into a single string. */
+    private static function concatCharGroups($groups)
+    {
+        $chars = '';
+        foreach ($groups as $group) {
+            $chars .= is_array($group) ? join('', $group) : $group;
+        }
+        return $chars;
+    }
+}
--- /dev/null
+<?php
+
+namespace thiagoalessio\TesseractOCR;
+
+/**
+ * Abstract base class of the library's own exceptions. TesseractOCR::run()
+ * catches this type to remove temporary files before rethrowing.
+ */
+abstract class TesseractOcrException extends \Exception
+{
+}
--- /dev/null
+<?php
+
+namespace thiagoalessio\TesseractOCR;
+
+/**
+ * Thrown by FriendlyErrors::checkProcessCreation() when the command could
+ * not be launched, and by FriendlyErrors::checkCommandExecution() when it
+ * produced no output.
+ */
+class UnsuccessfulCommandException extends TesseractOcrException
+{
+}