2 from bz2 import BZ2Decompressor
3 from zlib import decompressobj, MAX_WBITS
4 from gzip import FCOMMENT, FEXTRA, FHCRC, FNAME, FTEXT
5 from urlparse import urlparse
8 from twisted.python import log, filepath
9 from twisted.internet import defer
10 from twisted.trial import unittest
11 from twisted.web2 import stream
12 from twisted.web2.http import splitHostPort
14 from AptPackages import AptPackages
# File extensions that will also be decompressed while being cached
# (see MirrorManager.save_file).
DECOMPRESS_EXTS = ['.gz', '.bz2']
# Base names (compared lower-case) of index files eligible for decompression.
DECOMPRESS_FILES = ['release', 'sources', 'packages']
class MirrorError(Exception):
    """Raised when a request cannot be satisfied by the mirror."""
class ProxyFileStream(stream.SimpleStream):
    """Saves a stream to a file while providing a new stream.

    Wraps a twisted.web2 input stream: data read through the proxy is
    written to a cache file (and, for '.gz'/'.bz2' input, decompressed
    to a second file) while being passed on to the downstream reader.
    The doneDefer attribute fires with the hash object when the input
    stream is finished.
    NOTE(review): several lines of this class are elided in this view;
    the comments below flag the visible gaps.
    """

    def __init__(self, stream, outFile, hash, decompress = None, decFile = None):
        """Initializes the proxy.

        @type stream: C{twisted.web2.stream.IByteStream}
        @param stream: the input stream to read from
        @type outFile: C{twisted.python.filepath.FilePath}
        @param outFile: the file to write to
        @type hash: L{Hash.HashObject}
        @param hash: the hash object to use for the file
        @type decompress: C{string}
        @param decompress: also decompress the file as this type
            (currently only '.gz' and '.bz2' are supported)
        @type decFile: C{twisted.python.filepath.FilePath}
        @param decFile: the file to write the decompressed data to
        """
        self.outFile = outFile.open('w')
        # NOTE(review): assignments of self.stream and self.hash appear to
        # be elided here -- both are read below and in _write; confirm.
        if decompress == ".gz":
            self.gzfile = decFile.open('w')
            # Negative wbits selects a raw deflate stream: the gzip header
            # is stripped by hand (see _remove_gzip_header) before data is
            # handed to the decompressor.
            self.gzdec = decompressobj(-MAX_WBITS)
        elif decompress == ".bz2":
            self.bz2file = decFile.open('w')
            self.bz2dec = BZ2Decompressor()
        # Mirror the input stream's length so consumers see the same size.
        self.length = self.stream.length
        # Fired (with the hash object) once the input stream is finished.
        self.doneDefer = defer.Deferred()

    # NOTE(review): the 'def close(self):' line appears elided here.
        """Close the output file."""
        if not self.outFile.closed:
            # Flush whatever the gzip decompressor still holds so the
            # decompressed copy on disk is complete.
            data_dec = self.gzdec.flush()
            self.gzfile.write(data_dec)
        # Hand the completed hash object to whoever waits on doneDefer.
        self.doneDefer.callback(self.hash)

    # NOTE(review): the 'def read(self):' line appears elided here.
        """Read some data from the stream."""
        if self.outFile.closed:
        data = self.stream.read()
        if isinstance(data, defer.Deferred):
            # Deferred data: hash/write it when it arrives, finish on error.
            data.addCallbacks(self._write, self._done)

    def _write(self, data):
        """Write the stream data to the file and return it for others to use."""
        # NOTE(review): guard lines (e.g. handling data is None and the
        # 'if self.gzfile:' / first-chunk checks) appear elided below.
        self.outFile.write(data)
        self.hash.update(data)
                # First chunk of a gzip stream: strip the header before
                # feeding the raw deflate data to the decompressor.
                new_data = self._remove_gzip_header(data)
                dec_data = self.gzdec.decompress(new_data)
                dec_data = self.gzdec.decompress(data)
            self.gzfile.write(dec_data)
            dec_data = self.bz2dec.decompress(data)
            self.bz2file.write(dec_data)

    def _remove_gzip_header(self, data):
        """Skip past the gzip header at the start of data.

        Follows the gzip file format: magic bytes, compression method,
        flags, then optional extra/name/comment/CRC fields (FEXTRA,
        FNAME, FCOMMENT, FHCRC).  NOTE(review): the flag/skip setup and
        the trailing return appear elided in this view.
        """
        if data[:2] != '\037\213':
            raise IOError, 'Not a gzipped file'
        if ord(data[2]) != 8:
            raise IOError, 'Unknown compression method'
        # modtime = self.fileobj.read(4)
        # extraflag = self.fileobj.read(1)
        # os = self.fileobj.read(1)

            # Read & discard the extra field, if present
            xlen = xlen + 256*ord(data[11])
            skip = skip + 2 + xlen
            # Read and discard a null-terminated string containing the filename
                if not data[skip] or data[skip] == '\000':
            # Read and discard a null-terminated string containing a comment
                if not data[skip] or data[skip] == '\000':
            skip += 2 # Read & discard the 16-bit header CRC

    # NOTE(review): the '_done' method's def line appears elided here.
        """Clean everything up and return None to future reads."""
150 """Manages all requests for mirror objects."""
    def __init__(self, cache_dir, manager = None):
        """Initialize the mirror manager.

        @param cache_dir: directory to use for caching downloaded files
        @param manager: the main program object to notify when files are
            cached (may be None; see save_complete)
        """
        self.manager = manager
        self.cache_dir = cache_dir
        self.cache = filepath.FilePath(self.cache_dir)
        # NOTE(review): initialization of self.apt_caches (read by the
        # other methods) appears elided in this view.
    def extractPath(self, url):
        """Split a mirror URL into (site, baseDir, path).

        site is 'host:port'; baseDir is the mirror root (everything
        before '/dists/' or '/pool/'); path is the remainder (see the
        expectations in test_extractPath).  When neither marker is found,
        the known caches for the site are searched for the longest
        matching base directory.
        """
        parsed = urlparse(url)
        host, port = splitHostPort(parsed[0], parsed[1])
        site = host + ":" + str(port)
        # NOTE(review): the lines extracting 'path' from the parsed URL
        # and splitting off baseDir when a marker is found appear elided.
        i = max(path.rfind('/dists/'), path.rfind('/pool/'))
            # Uh oh, this is not good
            log.msg("Couldn't find a good base directory for path: %s" % (site + path))
            if site in self.apt_caches:
                # Fall back to the longest base directory, grown component
                # by component, that a known cache for this site matches.
                for base in self.apt_caches[site]:
                    for dirs in path.split('/'):
                        if base.startswith(base_match + '/' + dirs):
                            base_match += '/' + dirs
                    if len(base_match) > longest_match:
                        longest_match = len(base_match)
            log.msg("Settled on baseDir: %s" % baseDir)

        return site, baseDir, path
188 def init(self, site, baseDir):
189 if site not in self.apt_caches:
190 self.apt_caches[site] = {}
192 if baseDir not in self.apt_caches[site]:
193 site_cache = os.path.join(self.cache_dir, aptpkg_dir, 'mirrors', site + baseDir.replace('/', '_'))
194 self.apt_caches[site][baseDir] = AptPackages(site_cache)
196 def updatedFile(self, url, file_path):
197 site, baseDir, path = self.extractPath(url)
198 self.init(site, baseDir)
199 self.apt_caches[site][baseDir].file_updated(path, file_path)
    def findHash(self, url):
        """Find the hash for a URL via the matching AptPackages cache.

        Returns the cache's findHash Deferred when the site and base
        directory are known; otherwise errors back with
        MirrorError("Site Not Found").
        """
        site, baseDir, path = self.extractPath(url)
        if site in self.apt_caches and baseDir in self.apt_caches[site]:
            return self.apt_caches[site][baseDir].findHash(path)
        # NOTE(review): the creation of 'd' (presumably defer.Deferred())
        # and the trailing 'return d' appear elided in this view.
        d.errback(MirrorError("Site Not Found"))
    def save_file(self, response, hash, url):
        """Save a downloaded file to the cache and stream it.

        Replaces the response's stream with a ProxyFileStream that copies
        the body into the cache (decompressing well-known index files)
        while the caller consumes it; save_complete/save_error run when
        the stream finishes.
        """
        if response.code != 200:
            log.msg('File was not found (%r): %s' % (response, url))
        # NOTE(review): an early return for non-200 responses appears
        # elided here -- confirm upstream.
        log.msg('Returning file: %s' % url)
        # Cache location mirrors the URL: <cache>/<netloc><path>
        parsed = urlparse(url)
        destFile = self.cache.preauthChild(parsed[1] + parsed[2])
        log.msg('Saving returned %r byte file to cache: %s' % (response.stream.length, destFile.path))
        if destFile.exists():
            log.msg('File already exists, removing: %s' % destFile.path)
            # NOTE(review): makedirs under the exists-branch suggests an
            # elided 'elif not destFile.parent().exists():' -- confirm.
            destFile.parent().makedirs()
        # Decompress known index files (release/sources/packages with a
        # .gz/.bz2 extension) alongside the compressed copy.
        root, ext = os.path.splitext(destFile.basename())
        if root.lower() in DECOMPRESS_FILES and ext.lower() in DECOMPRESS_EXTS:
            decFile = destFile.sibling(root)
            log.msg('Decompressing to: %s' % decFile.path)
                log.msg('File already exists, removing: %s' % decFile.path)
        orig_stream = response.stream
        response.stream = ProxyFileStream(orig_stream, destFile, hash, ext, decFile)
        response.stream.doneDefer.addCallback(self.save_complete, url, destFile,
                                              response.headers.getHeader('Last-Modified'),
        response.stream.doneDefer.addErrback(self.save_error, url)
    def save_complete(self, hash, url, destFile, modtime = None, ext = None, decFile = None):
        """Update the modification time and AptPackages.

        Called with the hash object once the download stream finishes:
        sets the cached files' mtimes, verifies the hash, and on success
        registers the files with the AptPackages caches and notifies the
        manager that the file is cached.
        """
        # NOTE(review): guard lines (e.g. 'if modtime:' / 'if ext:')
        # appear elided around the utime calls below.
            os.utime(destFile.path, (modtime, modtime))
                os.utime(decFile.path, (modtime, modtime))
        # hash.verify() apparently yields True/False/None (None meaning no
        # expected hash was known) -- TODO confirm against the Hash module.
        result = hash.verify()
        if result or result is None:
                log.msg('Hashes match: %s' % url)
                log.msg('Hashed file to %s: %s' % (hash.hexdigest(), url))
            self.updatedFile(url, destFile.path)
                self.updatedFile(url[:-len(ext)], decFile.path)
                self.manager.cached_file(hash, url, destFile.path)
            log.msg("Hashes don't match %s != %s: %s" % (hash.hexexpected(), hash.hexdigest(), url))
    def save_error(self, failure, url):
        """An error has occurred in downloading or saving the file."""
        log.msg('Error occurred downloading %s' % url)
        # NOTE(review): the remainder of this handler (presumably logging
        # and propagating the failure) appears elided in this view.
276 class TestMirrorManager(unittest.TestCase):
277 """Unit tests for the mirror manager."""
284 self.client = MirrorManager('/tmp')
286 def test_extractPath(self):
287 site, baseDir, path = self.client.extractPath('http://ftp.us.debian.org/debian/dists/unstable/Release')
288 self.failUnless(site == "ftp.us.debian.org:80", "no match: %s" % site)
289 self.failUnless(baseDir == "/debian", "no match: %s" % baseDir)
290 self.failUnless(path == "/dists/unstable/Release", "no match: %s" % path)
292 site, baseDir, path = self.client.extractPath('http://ftp.us.debian.org:16999/debian/pool/d/dpkg/dpkg_1.2.1-1.tar.gz')
293 self.failUnless(site == "ftp.us.debian.org:16999", "no match: %s" % site)
294 self.failUnless(baseDir == "/debian", "no match: %s" % baseDir)
295 self.failUnless(path == "/pool/d/dpkg/dpkg_1.2.1-1.tar.gz", "no match: %s" % path)
297 site, baseDir, path = self.client.extractPath('http://debian.camrdale.org/dists/unstable/Release')
298 self.failUnless(site == "debian.camrdale.org:80", "no match: %s" % site)
299 self.failUnless(baseDir == "", "no match: %s" % baseDir)
300 self.failUnless(path == "/dists/unstable/Release", "no match: %s" % path)
302 def verifyHash(self, found_hash, path, true_hash):
303 self.failUnless(found_hash.hexexpected() == true_hash,
304 "%s hashes don't match: %s != %s" % (path, found_hash.hexexpected(), true_hash))
    def test_findHash(self):
        """Check findHash against the hashes recorded in the local APT lists.

        Relies on a Debian-style system: shells out to inspect
        /var/lib/apt/lists for the newest main Packages/Sources files and
        compares findHash results with the SHA1 values recorded there.
        """
        self.packagesFile = os.popen('ls -Sr /var/lib/apt/lists/ | grep -E "_main_.*Packages$" | tail -n 1').read().rstrip('\n')
        self.sourcesFile = os.popen('ls -Sr /var/lib/apt/lists/ | grep -E "_main_.*Sources$" | tail -n 1').read().rstrip('\n')
        # Find the Release file that corresponds to the Packages file.
        for f in os.walk('/var/lib/apt/lists').next()[2]:
            if f[-7:] == "Release" and self.packagesFile.startswith(f[:-7]):
        # NOTE(review): the loop body (presumably saving f as
        # self.releaseFile and breaking) appears elided in this view.
        # Feed the three index files into the MirrorManager, reversing the
        # apt-list filename mangling ('_' back to '/') to get their URLs.
        self.client.updatedFile('http://' + self.releaseFile.replace('_','/'),
                                '/var/lib/apt/lists/' + self.releaseFile)
        self.client.updatedFile('http://' + self.releaseFile[:self.releaseFile.find('_dists_')+1].replace('_','/') +
                                self.packagesFile[self.packagesFile.find('_dists_')+1:].replace('_','/'),
                                '/var/lib/apt/lists/' + self.packagesFile)
        self.client.updatedFile('http://' + self.releaseFile[:self.releaseFile.find('_dists_')+1].replace('_','/') +
                                self.sourcesFile[self.sourcesFile.find('_dists_')+1:].replace('_','/'),
                                '/var/lib/apt/lists/' + self.sourcesFile)
        lastDefer = defer.Deferred()
        # The Packages.bz2 index hash, straight from the Release file.
        idx_hash = os.popen('grep -A 3000 -E "^SHA1:" ' +
                            '/var/lib/apt/lists/' + self.releaseFile +
                            ' | grep -E " main/binary-i386/Packages.bz2$"'
                            ' | head -n 1 | cut -d\ -f 2').read().rstrip('\n')
        idx_path = 'http://' + self.releaseFile.replace('_','/')[:-7] + 'main/binary-i386/Packages.bz2'
        d = self.client.findHash(idx_path)
        d.addCallback(self.verifyHash, idx_path, idx_hash)
        # The dpkg binary package's hash and path, from the Packages file.
        pkg_hash = os.popen('grep -A 30 -E "^Package: dpkg$" ' +
                            '/var/lib/apt/lists/' + self.packagesFile +
                            ' | grep -E "^SHA1:" | head -n 1' +
                            ' | cut -d\ -f 2').read().rstrip('\n')
        pkg_path = 'http://' + self.releaseFile[:self.releaseFile.find('_dists_')+1].replace('_','/') + \
                   os.popen('grep -A 30 -E "^Package: dpkg$" ' +
                            '/var/lib/apt/lists/' + self.packagesFile +
                            ' | grep -E "^Filename:" | head -n 1' +
                            ' | cut -d\ -f 2').read().rstrip('\n')
        d = self.client.findHash(pkg_path)
        d.addCallback(self.verifyHash, pkg_path, pkg_hash)
        # The dpkg source files' hashes and paths, from the Sources file.
        src_dir = os.popen('grep -A 30 -E "^Package: dpkg$" ' +
                           '/var/lib/apt/lists/' + self.sourcesFile +
                           ' | grep -E "^Directory:" | head -n 1' +
                           ' | cut -d\ -f 2').read().rstrip('\n')
        src_hashes = os.popen('grep -A 20 -E "^Package: dpkg$" ' +
                              '/var/lib/apt/lists/' + self.sourcesFile +
                              ' | grep -A 4 -E "^Files:" | grep -E "^ " ' +
                              ' | cut -d\ -f 2').read().split('\n')[:-1]
        src_paths = os.popen('grep -A 20 -E "^Package: dpkg$" ' +
                             '/var/lib/apt/lists/' + self.sourcesFile +
                             ' | grep -A 4 -E "^Files:" | grep -E "^ " ' +
                             ' | cut -d\ -f 4').read().split('\n')[:-1]
        for i in range(len(src_hashes)):
            src_path = 'http://' + self.releaseFile[:self.releaseFile.find('_dists_')+1].replace('_','/') + src_dir + '/' + src_paths[i]
            d = self.client.findHash(src_path)
            d.addCallback(self.verifyHash, src_path, src_hashes[i])
        # The Sources.bz2 index hash, from the Release file.
        idx_hash = os.popen('grep -A 3000 -E "^SHA1:" ' +
                            '/var/lib/apt/lists/' + self.releaseFile +
                            ' | grep -E " main/source/Sources.bz2$"'
                            ' | head -n 1 | cut -d\ -f 2').read().rstrip('\n')
        idx_path = 'http://' + self.releaseFile.replace('_','/')[:-7] + 'main/source/Sources.bz2'
        d = self.client.findHash(idx_path)
        d.addCallback(self.verifyHash, idx_path, idx_hash)
        d.addBoth(lastDefer.callback)
        # NOTE(review): the trailing 'return lastDefer' appears elided.
378 for p in self.pending_calls: