Use the new DB in the main code.
[quix0rs-apt-p2p.git] / apt_dht / MirrorManager.py

from bz2 import BZ2Decompressor
from zlib import decompressobj, MAX_WBITS
from gzip import FCOMMENT, FEXTRA, FHCRC, FNAME, FTEXT
from urlparse import urlparse
import os

from twisted.python import log, filepath
from twisted.internet import defer
from twisted.trial import unittest
from twisted.web2 import stream
from twisted.web2.http import splitHostPort

from AptPackages import AptPackages

aptpkg_dir = '.apt-dht'

DECOMPRESS_EXTS = ['.gz', '.bz2']
DECOMPRESS_FILES = ['release', 'sources', 'packages']

class MirrorError(Exception):
    """Exception raised when there's a problem with the mirror."""

class ProxyFileStream(stream.SimpleStream):
    """Saves a stream to a file while providing a new stream."""

    def __init__(self, stream, outFile, hash, decompress=None, decFile=None):
        """Initializes the proxy.

        @type stream: C{twisted.web2.stream.IByteStream}
        @param stream: the input stream to read from
        @type outFile: C{twisted.python.filepath.FilePath}
        @param outFile: the file to write to
        @type hash: L{Hash.HashObject}
        @param hash: the hash object to use for the file
        @type decompress: C{string}
        @param decompress: also decompress the file as this type
            (currently only '.gz' and '.bz2' are supported)
        @type decFile: C{twisted.python.filepath.FilePath}
        @param decFile: the file to write the decompressed data to
        """
        self.stream = stream
        self.outFile = outFile.open('w')
        self.hash = hash
        self.hash.new()
        self.gzfile = None
        self.bz2file = None
        if decompress == ".gz":
            self.gzheader = True
            self.gzfile = decFile.open('w')
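            # A negative window size tells zlib to expect a raw deflate
            # stream, since the gzip header is stripped off manually in
            # _remove_gzip_header() before the data reaches the decompressor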
            self.gzdec = decompressobj(-MAX_WBITS)
        elif decompress == ".bz2":
            self.bz2file = decFile.open('w')
            self.bz2dec = BZ2Decompressor()
        self.length = self.stream.length
        self.start = 0
        self.doneDefer = defer.Deferred()

    def _done(self, result=None):
        """Close the output files and fire the completion deferred.

        The optional argument allows this to also be used as an errback,
        in which case the download is simply treated as complete.
        """
        if not self.outFile.closed:
            self.outFile.close()
            self.hash.digest()
            if self.gzfile:
                # Flush any data still buffered in the decompressor
                data_dec = self.gzdec.flush()
                self.gzfile.write(data_dec)
                self.gzfile.close()
                self.gzfile = None
            if self.bz2file:
                self.bz2file.close()
                self.bz2file = None

            self.doneDefer.callback(self.hash)

    def read(self):
        """Read some data from the stream."""
        if self.outFile.closed:
            return None

        data = self.stream.read()
        if isinstance(data, defer.Deferred):
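            # The data isn't ready yet, so add callbacks: the file is
            # written (and the data passed on) once it arrives, and any
            # error from the input stream finishes the download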
            data.addCallbacks(self._write, self._done)
            return data

        self._write(data)
        return data

    def _write(self, data):
        """Write the stream data to the file and return it for others to use."""
        if data is None:
            self._done()
            return data

        self.outFile.write(data)
        self.hash.update(data)
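        # Decompress the data as it passes through, writing the result to
        # the decompressed copy of the file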
        if self.gzfile:
            if self.gzheader:
                self.gzheader = False
                new_data = self._remove_gzip_header(data)
                dec_data = self.gzdec.decompress(new_data)
            else:
                dec_data = self.gzdec.decompress(data)
            self.gzfile.write(dec_data)
        if self.bz2file:
            dec_data = self.bz2dec.decompress(data)
            self.bz2file.write(dec_data)
        return data

    def _remove_gzip_header(self, data):
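        """Remove the gzip header from the first chunk of data.

        This assumes the entire gzip header arrives in the first chunk
        read from the stream.
        """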
        if data[:2] != '\037\213':
            raise IOError('Not a gzipped file')
        if ord(data[2]) != 8:
            raise IOError('Unknown compression method')
        flag = ord(data[3])
        # Bytes 4-7 (modification time), 8 (extra flags) and 9 (OS) are
        # covered by the fixed-length 10-byte header skipped below

        skip = 10
        if flag & FEXTRA:
            # Skip the extra field, if present
            xlen = ord(data[10])
            xlen = xlen + 256*ord(data[11])
            skip = skip + 2 + xlen
        if flag & FNAME:
            # Skip a null-terminated string containing the filename
            while data[skip] != '\000':
                skip += 1
            skip += 1
        if flag & FCOMMENT:
            # Skip a null-terminated string containing a comment
            while data[skip] != '\000':
                skip += 1
            skip += 1
        if flag & FHCRC:
            skip += 2     # Skip the 16-bit header CRC
        return data[skip:]

    def close(self):
        """Clean everything up and return None to future reads."""
        self.length = 0
        self._done()
        self.stream.close()

class MirrorManager:
    """Manages all requests for mirror objects."""

    def __init__(self, cache_dir, manager=None):
        self.manager = manager
        self.cache_dir = cache_dir
        self.cache = filepath.FilePath(self.cache_dir)
        self.apt_caches = {}

    def extractPath(self, url):
        """Split a URL into the mirror site, base directory, and file path."""
        parsed = urlparse(url)
        host, port = splitHostPort(parsed[0], parsed[1])
        site = host + ":" + str(port)
        path = parsed[2]

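        # Debian archives keep index files under dists/ and packages under
        # pool/, so anything before those directories is the mirror's base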
        i = max(path.rfind('/dists/'), path.rfind('/pool/'))
        if i >= 0:
            baseDir = path[:i]
            path = path[i:]
        else:
            # The URL doesn't contain a known archive directory, so fall
            # back to the longest base directory already seen for this site
            log.msg("Couldn't find a good base directory for path: %s" % (site + path))
            baseDir = ''
            if site in self.apt_caches:
                longest_match = 0
                for base in self.apt_caches[site]:
                    base_match = ''
                    for dirs in path.split('/'):
                        if base.startswith(base_match + '/' + dirs):
                            base_match += '/' + dirs
                        else:
                            break
                    if len(base_match) > longest_match:
                        longest_match = len(base_match)
                        baseDir = base_match
            log.msg("Settled on baseDir: %s" % baseDir)

        return site, baseDir, path
    def init(self, site, baseDir):
        """Make sure an AptPackages cache exists for this mirror."""
        if site not in self.apt_caches:
            self.apt_caches[site] = {}

        if baseDir not in self.apt_caches[site]:
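            # Give each (site, baseDir) pair its own AptPackages cache,
            # stored in a per-mirror directory under the main cache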
            site_cache = os.path.join(self.cache_dir, aptpkg_dir, 'mirrors', site + baseDir.replace('/', '_'))
            self.apt_caches[site][baseDir] = AptPackages(site_cache)

    def updatedFile(self, url, file_path):
        """Pass a downloaded file on to the appropriate AptPackages cache."""
        site, baseDir, path = self.extractPath(url)
        self.init(site, baseDir)
        self.apt_caches[site][baseDir].file_updated(path, file_path)

    def findHash(self, url):
        """Find the expected hash of a file, using the mirror's cached indexes."""
        site, baseDir, path = self.extractPath(url)
        if site in self.apt_caches and baseDir in self.apt_caches[site]:
            return self.apt_caches[site][baseDir].findHash(path)
        d = defer.Deferred()
        d.errback(MirrorError("Site Not Found"))
        return d

    def save_file(self, response, hash, url):
        """Save a downloaded file to the cache and stream it."""
        if response.code != 200:
            log.msg('File was not found (%r): %s' % (response, url))
            return response

        log.msg('Returning file: %s' % url)

        parsed = urlparse(url)
        destFile = self.cache.preauthChild(parsed[1] + parsed[2])
        log.msg('Saving returned %r byte file to cache: %s' % (response.stream.length, destFile.path))

        if destFile.exists():
            log.msg('File already exists, removing: %s' % destFile.path)
            destFile.remove()
        elif not destFile.parent().exists():
            destFile.parent().makedirs()

        root, ext = os.path.splitext(destFile.basename())
        if root.lower() in DECOMPRESS_FILES and ext.lower() in DECOMPRESS_EXTS:
            ext = ext.lower()
            decFile = destFile.sibling(root)
            log.msg('Decompressing to: %s' % decFile.path)
            if decFile.exists():
                log.msg('File already exists, removing: %s' % decFile.path)
                decFile.remove()
        else:
            ext = None
            decFile = None

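        # Wrap the response stream so the data is saved to the cache (and
        # optionally decompressed) as it is passed on to the requester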
        orig_stream = response.stream
        response.stream = ProxyFileStream(orig_stream, destFile, hash, ext, decFile)
        response.stream.doneDefer.addCallback(self.save_complete, url, destFile,
                                              response.headers.getHeader('Last-Modified'),
                                              ext, decFile)
        response.stream.doneDefer.addErrback(self.save_error, url)
        return response

    def save_complete(self, hash, url, destFile, modtime=None, ext=None, decFile=None):
        """Update the modification time and AptPackages."""
        if modtime:
            os.utime(destFile.path, (modtime, modtime))
            if ext:
                os.utime(decFile.path, (modtime, modtime))

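        # verify() returns True when the hashes match, False on a mismatch,
        # and None when there was no expected hash to check against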
        result = hash.verify()
        if result or result is None:
            if result:
                log.msg('Hashes match: %s' % url)
            else:
                log.msg('Hashed file to %s: %s' % (hash.hexdigest(), url))

            self.updatedFile(url, destFile.path)
            if ext:
                self.updatedFile(url[:-len(ext)], decFile.path)

            if self.manager:
                self.manager.cached_file(hash, url, destFile.path)
        else:
            log.msg("Hashes don't match %s != %s: %s" % (hash.hexexpected(), hash.hexdigest(), url))

    def save_error(self, failure, url):
        """An error occurred while downloading or saving the file."""
        log.msg('Error occurred downloading %s' % url)
        log.err(failure)
        return failure

class TestMirrorManager(unittest.TestCase):
    """Unit tests for the mirror manager."""

    timeout = 20
    pending_calls = []
    client = None

    def setUp(self):
        self.client = MirrorManager('/tmp')

    def test_extractPath(self):
        site, baseDir, path = self.client.extractPath('http://ftp.us.debian.org/debian/dists/unstable/Release')
        self.failUnless(site == "ftp.us.debian.org:80", "no match: %s" % site)
        self.failUnless(baseDir == "/debian", "no match: %s" % baseDir)
        self.failUnless(path == "/dists/unstable/Release", "no match: %s" % path)

        site, baseDir, path = self.client.extractPath('http://ftp.us.debian.org:16999/debian/pool/d/dpkg/dpkg_1.2.1-1.tar.gz')
        self.failUnless(site == "ftp.us.debian.org:16999", "no match: %s" % site)
        self.failUnless(baseDir == "/debian", "no match: %s" % baseDir)
        self.failUnless(path == "/pool/d/dpkg/dpkg_1.2.1-1.tar.gz", "no match: %s" % path)

        site, baseDir, path = self.client.extractPath('http://debian.camrdale.org/dists/unstable/Release')
        self.failUnless(site == "debian.camrdale.org:80", "no match: %s" % site)
        self.failUnless(baseDir == "", "no match: %s" % baseDir)
        self.failUnless(path == "/dists/unstable/Release", "no match: %s" % path)

    def verifyHash(self, found_hash, path, true_hash):
        self.failUnless(found_hash.hexexpected() == true_hash,
                    "%s hashes don't match: %s != %s" % (path, found_hash.hexexpected(), true_hash))

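    # Note: this test requires a Debian-like system, with apt package
    # indexes already downloaded to /var/lib/apt/lists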
    def test_findHash(self):
        self.packagesFile = os.popen('ls -Sr /var/lib/apt/lists/ | grep -E "_main_.*Packages$" | tail -n 1').read().rstrip('\n')
        self.sourcesFile = os.popen('ls -Sr /var/lib/apt/lists/ | grep -E "_main_.*Sources$" | tail -n 1').read().rstrip('\n')
        for f in os.walk('/var/lib/apt/lists').next()[2]:
            if f[-7:] == "Release" and self.packagesFile.startswith(f[:-7]):
                self.releaseFile = f
                break

        self.client.updatedFile('http://' + self.releaseFile.replace('_','/'),
                                '/var/lib/apt/lists/' + self.releaseFile)
        self.client.updatedFile('http://' + self.releaseFile[:self.releaseFile.find('_dists_')+1].replace('_','/') +
                                self.packagesFile[self.packagesFile.find('_dists_')+1:].replace('_','/'),
                                '/var/lib/apt/lists/' + self.packagesFile)
        self.client.updatedFile('http://' + self.releaseFile[:self.releaseFile.find('_dists_')+1].replace('_','/') +
                                self.sourcesFile[self.sourcesFile.find('_dists_')+1:].replace('_','/'),
                                '/var/lib/apt/lists/' + self.sourcesFile)

        lastDefer = defer.Deferred()

        idx_hash = os.popen('grep -A 3000 -E "^SHA1:" ' +
                            '/var/lib/apt/lists/' + self.releaseFile +
                            ' | grep -E " main/binary-i386/Packages.bz2$"'
                            ' | head -n 1 | cut -d\  -f 2').read().rstrip('\n')
        idx_path = 'http://' + self.releaseFile.replace('_','/')[:-7] + 'main/binary-i386/Packages.bz2'

        d = self.client.findHash(idx_path)
        d.addCallback(self.verifyHash, idx_path, idx_hash)

        pkg_hash = os.popen('grep -A 30 -E "^Package: dpkg$" ' +
                            '/var/lib/apt/lists/' + self.packagesFile +
                            ' | grep -E "^SHA1:" | head -n 1' +
                            ' | cut -d\  -f 2').read().rstrip('\n')
        pkg_path = 'http://' + self.releaseFile[:self.releaseFile.find('_dists_')+1].replace('_','/') + \
                   os.popen('grep -A 30 -E "^Package: dpkg$" ' +
                            '/var/lib/apt/lists/' + self.packagesFile +
                            ' | grep -E "^Filename:" | head -n 1' +
                            ' | cut -d\  -f 2').read().rstrip('\n')

        d = self.client.findHash(pkg_path)
        d.addCallback(self.verifyHash, pkg_path, pkg_hash)

        src_dir = os.popen('grep -A 30 -E "^Package: dpkg$" ' +
                            '/var/lib/apt/lists/' + self.sourcesFile +
                            ' | grep -E "^Directory:" | head -n 1' +
                            ' | cut -d\  -f 2').read().rstrip('\n')
        src_hashes = os.popen('grep -A 20 -E "^Package: dpkg$" ' +
                            '/var/lib/apt/lists/' + self.sourcesFile +
                            ' | grep -A 4 -E "^Files:" | grep -E "^ " ' +
                            ' | cut -d\  -f 2').read().split('\n')[:-1]
        src_paths = os.popen('grep -A 20 -E "^Package: dpkg$" ' +
                            '/var/lib/apt/lists/' + self.sourcesFile +
                            ' | grep -A 4 -E "^Files:" | grep -E "^ " ' +
                            ' | cut -d\  -f 4').read().split('\n')[:-1]

        for i in range(len(src_hashes)):
            src_path = 'http://' + self.releaseFile[:self.releaseFile.find('_dists_')+1].replace('_','/') + src_dir + '/' + src_paths[i]
            d = self.client.findHash(src_path)
            d.addCallback(self.verifyHash, src_path, src_hashes[i])

        idx_hash = os.popen('grep -A 3000 -E "^SHA1:" ' +
                            '/var/lib/apt/lists/' + self.releaseFile +
                            ' | grep -E " main/source/Sources.bz2$"'
                            ' | head -n 1 | cut -d\  -f 2').read().rstrip('\n')
        idx_path = 'http://' + self.releaseFile.replace('_','/')[:-7] + 'main/source/Sources.bz2'

        d = self.client.findHash(idx_path)
        d.addCallback(self.verifyHash, idx_path, idx_hash)

        d.addBoth(lastDefer.callback)
        return lastDefer

    def tearDown(self):
        for p in self.pending_calls:
            if p.active():
                p.cancel()
        self.client = None