From: Mikael Nordfeldth Date: Tue, 24 Feb 2015 20:11:25 +0000 (+0100) Subject: Don't store duplicates of files. X-Git-Url: https://git.mxchange.org/?a=commitdiff_plain;h=325e784ccd555e03b84dbc62abaca3ab405c3a13;p=quix0rs-gnu-social.git Don't store duplicates of files. If a new file is uploaded, it will be matched with a previously uploaded file so we don't have to store duplicates. SHA256 is random enough that collisions are unlikely enough to ignore. --- diff --git a/classes/File.php b/classes/File.php index 34cd4cdbc7..8fd040b7e9 100644 --- a/classes/File.php +++ b/classes/File.php @@ -28,6 +28,7 @@ class File extends Managed_DataObject public $id; // int(4) primary_key not_null public $urlhash; // varchar(64) unique_key public $url; // text + public $filehash; // varchar(64) indexed public $mimetype; // varchar(50) public $size; // int(4) public $title; // varchar(191) not 255 because utf8mb4 takes more space @@ -39,6 +40,7 @@ class File extends Managed_DataObject public $modified; // timestamp() not_null default_CURRENT_TIMESTAMP const URLHASH_ALG = 'sha256'; + const FILEHASH_ALG = 'sha256'; public static function schemaDef() { @@ -47,6 +49,7 @@ class File extends Managed_DataObject 'id' => array('type' => 'serial', 'not null' => true), 'urlhash' => array('type' => 'varchar', 'length' => 64, 'not null' => true, 'description' => 'sha256 of destination URL (url field)'), 'url' => array('type' => 'text', 'description' => 'destination URL after following possible redirections'), + 'filehash' => array('type' => 'varchar', 'length' => 64, 'not null' => false, 'description' => 'sha256 of the file contents, only for locally stored files of course'), 'mimetype' => array('type' => 'varchar', 'length' => 50, 'description' => 'mime type of resource'), 'size' => array('type' => 'int', 'description' => 'size of resource when available'), 'title' => array('type' => 'varchar', 'length' => 191, 'description' => 'title of resource when available'), @@ -62,6 +65,9 @@ class File
extends Managed_DataObject 'unique keys' => array( 'file_urlhash_key' => array('urlhash'), ), + 'indexes' => array( + 'file_filehash_idx' => array('filehash'), + ), ); } @@ -247,12 +253,7 @@ class File extends Managed_DataObject static function filename(Profile $profile, $origname, $mimetype) { - try { - $ext = common_supported_mime_to_ext($mimetype); - } catch (Exception $e) { - // We don't support this mimetype, but let's guess the extension - $ext = substr(strrchr($mimetype, '/'), 1); - } + $ext = self::guessMimeExtension($mimetype); // Normalize and make the original filename more URL friendly. $origname = basename($origname, ".$ext"); @@ -273,6 +274,17 @@ class File extends Managed_DataObject return $filename; } + static function guessMimeExtension($mimetype) + { + try { + $ext = common_supported_mime_to_ext($mimetype); + } catch (Exception $e) { + // We don't support this mimetype, but let's guess the extension + $ext = substr(strrchr($mimetype, '/'), 1); + } + return strtolower($ext); + } + /** * Validation for as-saved base filenames */ @@ -464,7 +476,11 @@ class File extends Managed_DataObject public function getPath() { - return self::path($this->filename); + $filepath = self::path($this->filename); + if (!file_exists($filepath)) { + throw new FileNotFoundException($filepath); + } + return $filepath; } public function getUrl() @@ -494,6 +510,19 @@ class File extends Managed_DataObject return $file; } + /** + * @param string $hashstr String of (preferably lower case) hexadecimal characters, same as result of 'hash_file(...)' + */ + static public function getByHash($hashstr, $alg=File::FILEHASH_ALG) + { + $file = new File(); + $file->filehash = strtolower($hashstr); + if (!$file->find(true)) { + throw new NoResultException($file); + } + return $file; + } + public function updateUrl($url) { $file = File::getKV('urlhash', self::hashurl($url)); diff --git a/classes/File_thumbnail.php b/classes/File_thumbnail.php index 609f1c34b8..acd488baba 100644 ---
a/classes/File_thumbnail.php +++ b/classes/File_thumbnail.php @@ -119,7 +119,11 @@ class File_thumbnail extends Managed_DataObject public function getPath() { - return self::path($this->filename); + $filepath = self::path($this->filename); + if (!file_exists($filepath)) { + throw new FileNotFoundException($filepath); + } + return $filepath; } public function getUrl() diff --git a/lib/default.php b/lib/default.php index 9c88832b66..0e6ad7b9fd 100644 --- a/lib/default.php +++ b/lib/default.php @@ -253,6 +253,7 @@ $default = 'user_quota' => 50000000, 'monthly_quota' => 15000000, 'uploads' => true, + 'filename_base' => 'hash', // for new files, choose one: 'upload', 'hash' 'show_html' => false, // show (filtered) text/html attachments (and oEmbed HTML etc.). Doesn't affect AJAX calls. 'show_thumbs' => true, // show thumbnails in notice lists for uploaded images, and photos and videos linked remotely that provide oEmbed info 'process_links' => true, // check linked resources for embeddable photos and videos; this will hit referenced external web sites when processing new messages. 
diff --git a/lib/mediafile.php b/lib/mediafile.php index 131cc7dd17..bcc8662f56 100644 --- a/lib/mediafile.php +++ b/lib/mediafile.php @@ -42,12 +42,13 @@ class MediaFile var $short_fileurl = null; var $mimetype = null; - function __construct(Profile $scoped, $filename = null, $mimetype = null) + function __construct(Profile $scoped, $filename = null, $mimetype = null, $filehash = null) { $this->scoped = $scoped; $this->filename = $filename; $this->mimetype = $mimetype; + $this->filehash = $filehash; $this->fileRecord = $this->storeFile(); $this->fileurl = common_local_url('attachment', @@ -90,6 +91,24 @@ class MediaFile protected function storeFile() { + $filepath = File::path($this->filename); + if (!empty($this->filename) && $this->filehash === null) { + // Calculate if we have an older upload method somewhere (Qvitter) that + // doesn't do this before calling new MediaFile on its local files... + $this->filehash = hash_file(File::FILEHASH_ALG, $filepath); + if ($this->filehash === false) { + throw new ServerException('Could not read file for hashing'); + } + } + + try { + $file = File::getByHash($this->filehash); + // We're done here. Yes. Already. We assume sha256 won't collide on us anytime soon. + return $file; + } catch (NoResultException $e) { + // Well, let's just continue below. 
+ } + $fileurl = File::url($this->filename); $file = new File; @@ -97,11 +116,15 @@ class MediaFile $file->filename = $this->filename; $file->urlhash = File::hashurl($fileurl); $file->url = $fileurl; - $filepath = File::path($this->filename); + $file->filehash = $this->filehash; $file->size = filesize($filepath); + if ($file->size === false) { + throw new ServerException('Could not read file to get its size'); + } $file->date = time(); $file->mimetype = $this->mimetype; + $file_id = $file->insert(); if ($file_id===false) { @@ -206,49 +229,86 @@ class MediaFile throw new ClientException(_('System error uploading file.')); } - // Throws exception if additional size does not respect quota - File::respectsQuota($scoped, $_FILES[$param]['size']); + // TODO: Make documentation clearer that this won't work for files >2GiB because + // PHP is stupid in its 32bit head. But no one accepts 2GiB files with PHP + // anyway... I hope. + $filehash = hash_file(File::FILEHASH_ALG, $_FILES[$param]['tmp_name']); + + try { + $file = File::getByHash($filehash); + // If no exception is thrown the file exists locally, so we'll use that and just add redirections. + $filename = $file->filename; + $mimetype = $file->mimetype; + + } catch (NoResultException $e) { + // We have to save the upload as a new local file. This is the normal course of action. - $mimetype = self::getUploadedMimeType($_FILES[$param]['tmp_name'], - $_FILES[$param]['name']); + // Throws exception if additional size does not respect quota + // This test is only needed, of course, if we're uploading something new.
+ File::respectsQuota($scoped, $_FILES[$param]['size']); - $basename = basename($_FILES[$param]['name']); - $filename = File::filename($scoped, $basename, $mimetype); - $filepath = File::path($filename); + $mimetype = self::getUploadedMimeType($_FILES[$param]['tmp_name'], $_FILES[$param]['name']); - $result = move_uploaded_file($_FILES[$param]['tmp_name'], $filepath); + switch (common_config('attachments', 'filename_base')) { + case 'upload': + $basename = basename($_FILES[$param]['name']); + $filename = File::filename($scoped, $basename, $mimetype); + break; + case 'hash': + default: + $filename = strtolower($filehash) . '.' . File::guessMimeExtension($mimetype); + } + $filepath = File::path($filename); - if (!$result) { - // TRANS: Client exception thrown when a file upload operation fails because the file could - // TRANS: not be moved from the temporary folder to the permanent file location. - throw new ClientException(_('File could not be moved to destination directory.')); + $result = move_uploaded_file($_FILES[$param]['tmp_name'], $filepath); + + if (!$result) { + // TRANS: Client exception thrown when a file upload operation fails because the file could + // TRANS: not be moved from the temporary folder to the permanent file location. + throw new ClientException(_('File could not be moved to destination directory.')); + } } - return new MediaFile($scoped, $filename, $mimetype); + return new MediaFile($scoped, $filename, $mimetype, $filehash); } static function fromFilehandle($fh, Profile $scoped) { - $stream = stream_get_meta_data($fh); + // So far we're only handling filehandles originating from tmpfile(), + // so we can always do hash_file on $stream['uri'] as far as I can tell! 
+ $filehash = hash_file(File::FILEHASH_ALG, $stream['uri']); - File::respectsQuota($scoped, filesize($stream['uri'])); - - $mimetype = self::getUploadedMimeType($stream['uri']); + try { + $file = File::getByHash($filehash); + // Already have it, so let's reuse the locally stored File + $filename = $file->filename; + $mimetype = $file->mimetype; + } catch (NoResultException $e) { + File::respectsQuota($scoped, filesize($stream['uri'])); - $filename = File::filename($scoped, "email", $mimetype); + $mimetype = self::getUploadedMimeType($stream['uri']); - $filepath = File::path($filename); + switch (common_config('attachments', 'filename_base')) { + case 'upload': + $filename = File::filename($scoped, "email", $mimetype); + break; + case 'hash': + default: + $filename = strtolower($filehash) . '.' . File::guessMimeExtension($mimetype); + } + $filepath = File::path($filename); - $result = copy($stream['uri'], $filepath) && chmod($filepath, 0664); + $result = copy($stream['uri'], $filepath) && chmod($filepath, 0664); - if (!$result) { - // TRANS: Client exception thrown when a file upload operation fails because the file could - // TRANS: not be moved from the temporary folder to the permanent file location. - throw new ClientException(_('File could not be moved to destination directory.' . - $stream['uri'] . ' ' . $filepath)); + if (!$result) { + // TRANS: Client exception thrown when a file upload operation fails because the file could + // TRANS: not be moved from the temporary folder to the permanent file location. + throw new ClientException(_('File could not be moved to destination directory.' . + $stream['uri'] . ' ' . 
$filepath)); + } } - return new MediaFile($scoped, $filename, $mimetype); + return new MediaFile($scoped, $filename, $mimetype, $filehash); } /** diff --git a/scripts/upgrade.php b/scripts/upgrade.php index c221a495af..692eaac17a 100644 --- a/scripts/upgrade.php +++ b/scripts/upgrade.php @@ -48,6 +48,7 @@ function main() fixupFileGeometry(); deleteLocalFileThumbnailsWithoutFilename(); deleteMissingLocalFileThumbnails(); + setFilehashOnLocalFiles(); initGroupProfileId(); initLocalGroup(); @@ -490,7 +491,9 @@ function deleteMissingLocalFileThumbnails() // Checking if there were any File_thumbnail entries without filename if ($thumbs->find()) { while ($thumbs->fetch()) { - if (!file_exists(File_thumbnail::path($thumbs->filename))) { + try { + $thumbs->getPath(); + } catch (FileNotFoundException $e) { $thumbs->delete(); } } @@ -499,4 +502,30 @@ function deleteMissingLocalFileThumbnails() printfnq("DONE.\n"); } +/* + * Files are now stored with their hash, so let's generate for previously uploaded files. + */ +function setFilehashOnLocalFiles() +{ + printfnq('Ensuring all local files have the filehash field set...'); + + $file = new File(); + $file->whereAdd('filename IS NOT NULL'); // local files + $file->whereAdd('filehash IS NULL', 'AND'); // without filehash value + + if ($file->find()) { + while ($file->fetch()) { + try { + $orig = clone($file); + $file->filehash = hash_file(File::FILEHASH_ALG, $file->getPath()); + $file->update($orig); + } catch (FileNotFoundException $e) { + echo "\n WARNING: file ID {$file->id} does not exist on path '{$e->path}'. Clean up the file table?"; + } + } + } + + printfnq("DONE.\n"); +} + main();