Decompress needed files while downloading them.
[quix0rs-apt-p2p.git] / apt_dht / MirrorManager.py
1
2 from bz2 import BZ2Decompressor
3 from zlib import decompressobj, MAX_WBITS
4 from gzip import FCOMMENT, FEXTRA, FHCRC, FNAME, FTEXT
5 from urlparse import urlparse
6 import os
7
8 from twisted.python import log, filepath
9 from twisted.internet import defer
10 from twisted.trial import unittest
11 from twisted.web2 import stream
12 from twisted.web2.http import splitHostPort
13
14 from AptPackages import AptPackages
15
# Sub-directory (of the cache directory) where the per-mirror apt package
# caches are stored; see MirrorManager.init().
aptpkg_dir='.apt-dht'

# A downloaded index file is decompressed on the fly (see save_file) when its
# extension is in DECOMPRESS_EXTS and its lower-cased base name is in
# DECOMPRESS_FILES.
DECOMPRESS_EXTS = ['.gz', '.bz2']
DECOMPRESS_FILES = ['release', 'sources', 'packages']
20
class MirrorError(Exception):
    """Exception raised when there's a problem with the mirror.
    
    Currently used by L{MirrorManager.findHash} to errback the returned
    Deferred when a hash is requested for a site/baseDir that has no
    cached package index.
    """
23
class ProxyFileStream(stream.SimpleStream):
    """Saves a stream to a file while providing a new stream.
    
    Data read through this proxy is simultaneously written to an output
    file, and optionally also decompressed ('.gz' or '.bz2') to a second
    file.  When the input stream is exhausted the files are closed and
    C{doneDefer} fires with 1; if reading the input stream fails, the
    files are closed and C{doneDefer} errbacks with the failure.
    """
    
    def __init__(self, stream, outFile, decompress = None, decFile = None):
        """Initializes the proxy.
        
        @type stream: C{twisted.web2.stream.IByteStream}
        @param stream: the input stream to read from
        @type outFile: C{twisted.python.filepath.FilePath}
        @param outFile: the file to write to
        @type decompress: C{string}
        @param decompress: also decompress the file as this type
            (currently only '.gz' and '.bz2' are supported)
        @type decFile: C{twisted.python.filepath.FilePath}
        @param decFile: the file to write the decompressed data to
        """
        self.stream = stream
        self.outFile = outFile.open('w')
        self.gzfile = None
        self.bz2file = None
        if decompress == ".gz":
            # The gzip header is stripped by hand (see _remove_gzip_header),
            # so zlib is set up for a raw deflate stream: negative wbits
            # means "no zlib/gzip header expected".
            self.gzheader = True
            self.gzfile = decFile.open('w')
            self.gzdec = decompressobj(-MAX_WBITS)
        elif decompress == ".bz2":
            self.bz2file = decFile.open('w')
            self.bz2dec = BZ2Decompressor()
        self.length = self.stream.length
        self.start = 0
        self.doneDefer = defer.Deferred()

    def _done(self):
        """Close all the output files and fire the done Deferred."""
        if not self.outFile.closed:
            self.outFile.close()
            if self.gzfile:
                # Flush any decompressed data still buffered inside zlib.
                data_dec = self.gzdec.flush()
                self.gzfile.write(data_dec)
                self.gzfile.close()
                self.gzfile = None
            if self.bz2file:
                self.bz2file.close()
                self.bz2file = None
                
            self.doneDefer.callback(1)
    
    def _error(self, err):
        """Close all the output files and errback the done Deferred.
        
        BUG FIX: previously _done (which takes no arguments and fires
        doneDefer.callback) was installed as the errback in read(), so a
        stream error raised a TypeError and the error never reached the
        errbacks attached to doneDefer (e.g. MirrorManager.save_error).
        
        @param err: the failure that occurred reading the input stream
        @return: the failure, so downstream consumers also see the error
        """
        if not self.outFile.closed:
            self.outFile.close()
            # No flush here: the data is incomplete, so the decompressed
            # output would be truncated anyway.
            if self.gzfile:
                self.gzfile.close()
                self.gzfile = None
            if self.bz2file:
                self.bz2file.close()
                self.bz2file = None
            self.doneDefer.errback(err)
        return err
    
    def read(self):
        """Read some data from the stream.
        
        @return: the data read (possibly a Deferred), or None when the
            stream has already been exhausted/closed
        """
        if self.outFile.closed:
            return None
        
        data = self.stream.read()
        if isinstance(data, defer.Deferred):
            # Route errors to _error so doneDefer errbacks (see _error).
            data.addCallbacks(self._write, self._error)
            return data
        
        self._write(data)
        return data
    
    def _write(self, data):
        """Write the stream data to the file and return it for others to use.
        
        @param data: the data read from the input stream, or None at EOF
        """
        if data is None:
            self._done()
            return data
        
        self.outFile.write(data)
        if self.gzfile:
            if self.gzheader:
                # NOTE(review): assumes the entire gzip header arrives in
                # the first chunk of data -- confirm for small first reads.
                self.gzheader = False
                new_data = self._remove_gzip_header(data)
                dec_data = self.gzdec.decompress(new_data)
            else:
                dec_data = self.gzdec.decompress(data)
            self.gzfile.write(dec_data)
        if self.bz2file:
            dec_data = self.bz2dec.decompress(data)
            self.bz2file.write(dec_data)
        return data
    
    def _remove_gzip_header(self, data):
        """Strip the gzip header (RFC 1952) from the start of the data.
        
        @param data: the first chunk of the gzipped stream
        @return: the data with the header removed (raw deflate stream)
        @raise IOError: if the data is not a valid gzip stream
        """
        if data[:2] != '\037\213':
            raise IOError('Not a gzipped file')
        if ord(data[2]) != 8:
            raise IOError('Unknown compression method')
        flag = ord(data[3])
        # Bytes 4-9 (modtime, extra flags, OS) are skipped unconditionally.

        skip = 10
        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(data[10])
            xlen = xlen + 256*ord(data[11])
            skip = skip + 2 + xlen
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                if not data[skip] or data[skip] == '\000':
                    break
                skip += 1
            skip += 1
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                if not data[skip] or data[skip] == '\000':
                    break
                skip += 1
            skip += 1
        if flag & FHCRC:
            skip += 2     # Read & discard the 16-bit header CRC
        return data[skip:]

    def close(self):
        """Clean everything up and return None to future reads."""
        self.length = 0
        self._done()
        self.stream.close()
142
class MirrorManager:
    """Manages all requests for mirror objects.
    
    Keeps one L{AptPackages} index cache per (site, base directory)
    pair, stored in C{apt_caches} as a nested dict:
    site ('host:port') -> baseDir -> AptPackages.
    """
    
    def __init__(self, cache_dir):
        # Root directory under which downloaded files and apt caches live.
        self.cache_dir = cache_dir
        self.cache = filepath.FilePath(self.cache_dir)
        # site -> baseDir -> AptPackages (see class docstring).
        self.apt_caches = {}
    
    def extractPath(self, url):
        """Break a mirror URL into its site, base directory and path.
        
        @param url: the URL of a file on a mirror
        @return: a (site, baseDir, path) tuple where site is 'host:port',
            baseDir is the mirror root and path is the remainder
            (normally starting with '/dists/' or '/pool/')
        """
        parsed = urlparse(url)
        # parsed[0] is the scheme (supplies the default port),
        # parsed[1] is the network location ('host' or 'host:port').
        host, port = splitHostPort(parsed[0], parsed[1])
        site = host + ":" + str(port)
        path = parsed[2]
            
        # Debian archives keep everything under 'dists' or 'pool', so the
        # last such segment marks the split between base dir and path.
        i = max(path.rfind('/dists/'), path.rfind('/pool/'))
        if i >= 0:
            baseDir = path[:i]
            path = path[i:]
        else:
            # Uh oh, this is not good
            log.msg("Couldn't find a good base directory for path: %s" % (site + path))
            baseDir = ''
            if site in self.apt_caches:
                # Fall back to the longest already-known base directory
                # that matches this path component-by-component.
                longest_match = 0
                for base in self.apt_caches[site]:
                    base_match = ''
                    for dirs in path.split('/'):
                        if base.startswith(base_match + '/' + dirs):
                            base_match += '/' + dirs
                        else:
                            break
                    if len(base_match) > longest_match:
                        longest_match = len(base_match)
                        baseDir = base_match
            log.msg("Settled on baseDir: %s" % baseDir)
        
        return site, baseDir, path
        
    def init(self, site, baseDir):
        """Make sure an AptPackages cache exists for this site/baseDir."""
        if site not in self.apt_caches:
            self.apt_caches[site] = {}
            
        if baseDir not in self.apt_caches[site]:
            # Each (site, baseDir) pair gets its own on-disk cache dir;
            # '/' is flattened to '_' to build a single directory name.
            site_cache = os.path.join(self.cache_dir, aptpkg_dir, 'mirrors', site + baseDir.replace('/', '_'))
            self.apt_caches[site][baseDir] = AptPackages(site_cache)
    
    def findHash(self, url):
        """Find the hash for a URL from the cached package indexes.
        
        @param url: the URL of the file to look up
        @return: a Deferred that fires with the hash info, or errbacks
            with L{MirrorError} if the site/baseDir is not yet known
        """
        site, baseDir, path = self.extractPath(url)
        if site in self.apt_caches and baseDir in self.apt_caches[site]:
            return self.apt_caches[site][baseDir].findHash(path)
        d = defer.Deferred()
        d.errback(MirrorError("Site Not Found"))
        return d

    def save_file(self, response, hash, size, url):
        """Save a downloaded file to the cache and stream it.
        
        Replaces the response's stream with a L{ProxyFileStream} so the
        data is written to the cache (and decompressed, for recognized
        index files) while being delivered to the requester.
        
        @param response: the HTTP response whose stream is to be saved
        @param hash: the expected hash of the file (currently unused here)
        @param size: the expected size of the file (currently unused here)
        @param url: the URL the file was downloaded from
        @return: the response, with its stream wrapped by the proxy
        """
        log.msg('Returning file: %s' % url)
        
        parsed = urlparse(url)
        destFile = self.cache.preauthChild(parsed[1] + parsed[2])
        log.msg('Saving returned %r byte file to cache: %s' % (response.stream.length, destFile.path))
        
        if destFile.exists():
            log.msg('File already exists, removing: %s' % destFile.path)
            destFile.remove()
        else:
            destFile.parent().makedirs()
            
        # Only recognized index files (Release/Sources/Packages with a
        # known compression extension) are decompressed on the fly.
        root, ext = os.path.splitext(destFile.basename())
        if root.lower() in DECOMPRESS_FILES and ext.lower() in DECOMPRESS_EXTS:
            ext = ext.lower()
            decFile = destFile.sibling(root)
            log.msg('Decompressing to: %s' % decFile.path)
            if decFile.exists():
                log.msg('File already exists, removing: %s' % decFile.path)
                decFile.remove()
        else:
            ext = None
            decFile = None
        
        orig_stream = response.stream
        response.stream = ProxyFileStream(orig_stream, destFile, ext, decFile)
        response.stream.doneDefer.addCallback(self.save_complete, url, destFile,
                                              response.headers.getHeader('Last-Modified'),
                                              ext, decFile)
        response.stream.doneDefer.addErrback(self.save_error, url)
        return response

    def save_complete(self, result, url, destFile, modtime = None, ext = None, decFile = None):
        """Update the modification time and AptPackages.
        
        @param result: the (ignored) result of the ProxyFileStream's
            doneDefer
        @param url: the URL the file was downloaded from
        @param destFile: the cache file the download was saved to
        @param modtime: the Last-Modified time (seconds) to apply, if any
        @param ext: the compression extension, if the file was also
            decompressed
        @param decFile: the decompressed file, if any
        """
        if modtime:
            os.utime(destFile.path, (modtime, modtime))
            if ext:
                os.utime(decFile.path, (modtime, modtime))
            
        site, baseDir, path = self.extractPath(url)
        self.init(site, baseDir)
        self.apt_caches[site][baseDir].file_updated(path, destFile.path)
        if ext:
            # Register the decompressed copy under the URL path minus
            # its compression extension.
            self.apt_caches[site][baseDir].file_updated(path[:-len(ext)], decFile.path)

    def save_error(self, failure, url):
        """An error has occurred in downloading or saving the file."""
        log.msg('Error occurred downloading %s' % url)
        log.err(failure)
        return failure
249
class TestMirrorManager(unittest.TestCase):
    """Unit tests for the mirror manager.
    
    test_findHash requires a populated /var/lib/apt/lists on the host,
    since it shells out to inspect the real apt index files.
    """
    
    timeout = 20
    # Class-level defaults; both are shadowed by per-instance values in
    # setUp (see BUG FIX note there).
    pending_calls = []
    client = None
    
    def setUp(self):
        # BUG FIX: pending_calls was previously only a class attribute,
        # so a mutable list was shared by every test instance and calls
        # appended in one test leaked into another's tearDown.  Give each
        # test its own list.
        self.pending_calls = []
        self.client = MirrorManager('/tmp')
        
    def test_extractPath(self):
        """Check site/baseDir/path splitting for typical mirror URLs."""
        site, baseDir, path = self.client.extractPath('http://ftp.us.debian.org/debian/dists/unstable/Release')
        self.failUnless(site == "ftp.us.debian.org:80", "no match: %s" % site)
        self.failUnless(baseDir == "/debian", "no match: %s" % baseDir)
        self.failUnless(path == "/dists/unstable/Release", "no match: %s" % path)

        site, baseDir, path = self.client.extractPath('http://ftp.us.debian.org:16999/debian/pool/d/dpkg/dpkg_1.2.1-1.tar.gz')
        self.failUnless(site == "ftp.us.debian.org:16999", "no match: %s" % site)
        self.failUnless(baseDir == "/debian", "no match: %s" % baseDir)
        self.failUnless(path == "/pool/d/dpkg/dpkg_1.2.1-1.tar.gz", "no match: %s" % path)

        site, baseDir, path = self.client.extractPath('http://debian.camrdale.org/dists/unstable/Release')
        self.failUnless(site == "debian.camrdale.org:80", "no match: %s" % site)
        self.failUnless(baseDir == "", "no match: %s" % baseDir)
        self.failUnless(path == "/dists/unstable/Release", "no match: %s" % path)

    def verifyHash(self, found_hash, path, true_hash):
        """Assert that the hash found for path matches the expected one."""
        self.failUnless(found_hash[0] == true_hash, 
                    "%s hashes don't match: %s != %s" % (path, found_hash[0], true_hash))

    def test_findHash(self):
        """Look up hashes for index, package and source files.
        
        Uses the host's real apt lists: picks the largest Packages and
        Sources files, registers them with the client, then checks that
        findHash returns the SHA1s recorded in the indexes.
        """
        # Largest Packages/Sources files from the 'main' component.
        self.packagesFile = os.popen('ls -Sr /var/lib/apt/lists/ | grep -E "_main_.*Packages$" | tail -n 1').read().rstrip('\n')
        self.sourcesFile = os.popen('ls -Sr /var/lib/apt/lists/ | grep -E "_main_.*Sources$" | tail -n 1').read().rstrip('\n')
        for f in os.walk('/var/lib/apt/lists').next()[2]:
            if f[-7:] == "Release" and self.packagesFile.startswith(f[:-7]):
                self.releaseFile = f
                break
        
        # NOTE(review): MirrorManager in this file defines no updatedFile
        # method -- confirm it exists (or is added) on the class under test.
        self.client.updatedFile('http://' + self.releaseFile.replace('_','/'), 
                                '/var/lib/apt/lists/' + self.releaseFile)
        self.client.updatedFile('http://' + self.releaseFile[:self.releaseFile.find('_dists_')+1].replace('_','/') +
                                self.packagesFile[self.packagesFile.find('_dists_')+1:].replace('_','/'), 
                                '/var/lib/apt/lists/' + self.packagesFile)
        self.client.updatedFile('http://' + self.releaseFile[:self.releaseFile.find('_dists_')+1].replace('_','/') +
                                self.sourcesFile[self.sourcesFile.find('_dists_')+1:].replace('_','/'), 
                                '/var/lib/apt/lists/' + self.sourcesFile)

        lastDefer = defer.Deferred()
        
        # SHA1 of the Packages.bz2 index, taken from the Release file.
        idx_hash = os.popen('grep -A 3000 -E "^SHA1:" ' + 
                            '/var/lib/apt/lists/' + self.releaseFile + 
                            ' | grep -E " main/binary-i386/Packages.bz2$"'
                            ' | head -n 1 | cut -d\  -f 2').read().rstrip('\n')
        idx_path = 'http://' + self.releaseFile.replace('_','/')[:-7] + 'main/binary-i386/Packages.bz2'

        d = self.client.findHash(idx_path)
        d.addCallback(self.verifyHash, idx_path, idx_hash)

        # SHA1 and path of the dpkg binary package, from the Packages file.
        pkg_hash = os.popen('grep -A 30 -E "^Package: dpkg$" ' + 
                            '/var/lib/apt/lists/' + self.packagesFile + 
                            ' | grep -E "^SHA1:" | head -n 1' + 
                            ' | cut -d\  -f 2').read().rstrip('\n')
        pkg_path = 'http://' + self.releaseFile[:self.releaseFile.find('_dists_')+1].replace('_','/') + \
                   os.popen('grep -A 30 -E "^Package: dpkg$" ' + 
                            '/var/lib/apt/lists/' + self.packagesFile + 
                            ' | grep -E "^Filename:" | head -n 1' + 
                            ' | cut -d\  -f 2').read().rstrip('\n')

        d = self.client.findHash(pkg_path)
        d.addCallback(self.verifyHash, pkg_path, pkg_hash)

        # Hashes and paths of the dpkg source files, from the Sources file.
        src_dir = os.popen('grep -A 30 -E "^Package: dpkg$" ' + 
                            '/var/lib/apt/lists/' + self.sourcesFile + 
                            ' | grep -E "^Directory:" | head -n 1' + 
                            ' | cut -d\  -f 2').read().rstrip('\n')
        src_hashes = os.popen('grep -A 20 -E "^Package: dpkg$" ' + 
                            '/var/lib/apt/lists/' + self.sourcesFile + 
                            ' | grep -A 4 -E "^Files:" | grep -E "^ " ' + 
                            ' | cut -d\  -f 2').read().split('\n')[:-1]
        src_paths = os.popen('grep -A 20 -E "^Package: dpkg$" ' + 
                            '/var/lib/apt/lists/' + self.sourcesFile + 
                            ' | grep -A 4 -E "^Files:" | grep -E "^ " ' + 
                            ' | cut -d\  -f 4').read().split('\n')[:-1]

        for i in range(len(src_hashes)):
            src_path = 'http://' + self.releaseFile[:self.releaseFile.find('_dists_')+1].replace('_','/') + src_dir + '/' + src_paths[i]
            d = self.client.findHash(src_path)
            d.addCallback(self.verifyHash, src_path, src_hashes[i])
            
        # SHA1 of the Sources.bz2 index, taken from the Release file.
        idx_hash = os.popen('grep -A 3000 -E "^SHA1:" ' + 
                            '/var/lib/apt/lists/' + self.releaseFile + 
                            ' | grep -E " main/source/Sources.bz2$"'
                            ' | head -n 1 | cut -d\  -f 2').read().rstrip('\n')
        idx_path = 'http://' + self.releaseFile.replace('_','/')[:-7] + 'main/source/Sources.bz2'

        d = self.client.findHash(idx_path)
        d.addCallback(self.verifyHash, idx_path, idx_hash)

        d.addBoth(lastDefer.callback)
        return lastDefer

    def tearDown(self):
        """Cancel any outstanding delayed calls and drop the client."""
        for p in self.pending_calls:
            if p.active():
                p.cancel()
        self.client = None
356