Pyhttpxtract
Jump to navigation
Jump to search
pyHttpXtract.py
Objectives
This script decodes HTTP conversations from a pcap file. It displays each request and response and, for the latter, decodes any attachments (text/xml, image/gif, image/jpeg), even if they are compressed with gzip.
The gzip file format is specified in RFC 1952. Applied to our example, we recognize a gzip signature (1f8b) in frame #45:
746e 6572 3a20 6f72 6967 696e 2e30 0d0a  tner: origin.0..
0d0a 1f8b 0800 0000 0000 0000 edbd e976  ...............v
dc38 9a2d fabf 9f02 95f7 56af acb5 2c81  .8.-......V...,.
According to RFC 1952, we can decompose the header bytes as follows:
+---+---+---+---+---+---+---+---+---+---+
|ID1|ID2|CM |FLG|     MTIME     |XFL|OS |
+---+---+---+---+---+---+---+---+---+---+
|1f |8b |08 |00 |00 00 00 00    |00 |00 |
+---+---+---+---+---+---+---+---+---+---+
We can identify:
+-----+----+------------------------------------------------------------------------------+
| ID1 | 1f | Fixed value of 31 (0x1f, \037) composing first part of gzip magic number     |
+-----+----+------------------------------------------------------------------------------+
| ID2 | 8b | Fixed value of 139 (0x8b, \213) composing second part of gzip magic number   |
+-----+----+------------------------------------------------------------------------------+
| CM  | 08 | Compression method. "This identifies the compression method used in the file.|
|     |    | CM = 0-7 are reserved. CM = 8 denotes the "deflate" compression method, which|
|     |    | is the one customarily used by gzip and which is documented elsewhere". Since|
|     |    | the value here is 8, we know we have to deal with "deflate" format.          |
+-----+----+------------------------------------------------------------------------------+
| FLG |    | all values are 0 in our capture. This flag byte is divided into individual   |
|     |    | bits as follows:                                                             |
|     |    |    bit 0   FTEXT                                                             |
|     |    |    bit 1   FHCRC                                                             |
|     |    |    bit 2   FEXTRA                                                            |
|     |    |    bit 3   FNAME                                                             |
|     |    |    bit 4   FCOMMENT                                                          |
|     |    |    bit 5   reserved                                                          |
|     |    |    bit 6   reserved                                                          |
|     |    |    bit 7   reserved                                                          |
+-----+----+------------------------------------------------------------------------------+
|MTIME| 00 |                                                                              |
+-----+----+------------------------------------------------------------------------------+
| XFL | 00 |                                                                              |
+-----+----+------------------------------------------------------------------------------+
| OS  | 00 | OS :                                                                         |
|     |    |      0 - FAT filesystem (MS-DOS, OS/2, NT/Win32)                             |
|     |    |      1 - Amiga                                                               |
|     |    |      2 - VMS (or OpenVMS)                                                    |
|     |    |      3 - Unix                                                                |
|     |    |      4 - VM/CMS                                                              |
|     |    |      5 - Atari TOS                                                           |
|     |    |      6 - HPFS filesystem (OS/2, NT)                                          |
|     |    |      7 - Macintosh                                                           |
|     |    |      8 - Z-System                                                            |
|     |    |      9 - CP/M                                                                |
|     |    |     10 - TOPS-20                                                             |
|     |    |     11 - NTFS filesystem (NT)                                                |
|     |    |     12 - QDOS                                                                |
|     |    |     13 - Acorn RISCOS                                                        |
|     |    |    255 - unknown                                                             |
+-----+----+------------------------------------------------------------------------------+
Script limitations
- Doesn't check input file type (pcap)
- Only recognizes text/xml, image/gif, image/jpeg
- Only works "off line" (use of pcapy.open_offline)
- Tested for Linux Debian 5 only (should be compatible with other systems but untested)
- Script is slow because it relies on the Python chardet module
Script content
This project is hosted on Google Code. Please report any bugs there.
#!/usr/bin/env python
"""pyHttpXtract: rebuild HTTP conversations found in a pcap capture.

For every TCP packet of the capture the script appends the printable
(ASCII) lines to an HTML report and dumps everything else into one
attachment file per response.  Attachments that start with the gzip magic
number are gunzipped afterwards.

The script targets Python 2 (like pcapy/impacket at the time it was
written).  ``print`` is used in its parenthesised single-argument form so
that the module at least parses under Python 3 as well.
"""
from optparse import OptionParser
import sys
import shutil
import pcapy
import impacket.ImpactDecoder as Decoders
import impacket.ImpactPacket as Packets
import os.path
import os
import chardet
import string
import gzip
import zlib

# Marker glued in front of every chunk of TCP payload that was preceded by a
# CRLF.  It lets the line loop restore the CRLF inside binary attachments
# after the payload has been split into lines.
_CRLF_MARK = '###~~~###'

# Name of the HTML report, relative to the report directory.
_REPORT = "report.html"


def _escapeHtml(text):
    """Return text with the HTML special characters replaced by entities."""
    return (text.replace('&', '&amp;')
                .replace('<', '&lt;')
                .replace('>', '&gt;')
                .replace('"', '&quot;'))


def _extensionFromContentType(line):
    """Derive a file extension (with leading dot) from a Content-Type line.

    The extension is the MIME subtype, e.g. ``Content-Type: image/gif`` gives
    ``.gif``; the subtype ``gzip`` is mapped to ``.gz``.  Characters that are
    not alphanumeric or one of ``+-._`` are dropped because the value comes
    from captured traffic and ends up in file names and in the report.

    Returns None when the header holds no ``type/subtype`` pair.
    """
    parts = line.split("/")
    if len(parts) < 2:
        return None
    subtype = parts[1].split(";")[0].strip()
    subtype = "".join(c for c in subtype if c.isalnum() or c in "+-._")
    if subtype == 'gzip':
        return '.gz'
    return '.' + subtype


class pcapGzip:
    """Extract HTTP requests, responses and attachments from a pcap file."""

    def __init__(self, pcapfile, reportpath="./report"):
        """Remember the capture to read and create the report directory.

        pcapfile   -- path of the capture file (pcap format)
        reportpath -- directory receiving report.html and the attachments;
                      created when missing

        Raises TypeError when pcapfile is empty or does not exist.
        """
        if not pcapfile or not os.path.exists(pcapfile):
            raise TypeError("Pcap file not found. Please check location.")
        self.reportpath = reportpath
        if not os.path.exists(self.reportpath):
            os.makedirs(self.reportpath)
        self.pcapfile = pcapfile
        # Parsing state shared by all packets; reset by createFlows().
        self._packetnum = 0
        self._ext = ''
        self._lastAttach = ''

    def uncompressGzip(self, file):
        """Gunzip a gz file and remove the compressed copy on success.

        file -- path of the gzip file.  The output path is the same path
                without its trailing ".gz"; a path that does not end in
                ".gz" is written to "<file>.uncompressed" so the input is
                never overwritten.

        Failures are reported on stdout and leave the compressed file in
        place; nothing is raised.
        """
        suffix = '.gz'
        if file.endswith(suffix):
            # Remove the suffix itself: rstrip('.gz') would strip any trailing
            # run of the characters '.', 'g' and 'z' ("x.png.gz" -> "x.pn").
            write_file = file[:-len(suffix)]
        else:
            write_file = file + '.uncompressed'
        try:
            r_file = gzip.GzipFile(file, 'rb')
            try:
                data = r_file.read()
            finally:
                r_file.close()
            with open(write_file, 'wb') as w_file:
                w_file.write(data)
            os.unlink(file)
        except (IOError, OSError, EOFError, zlib.error) as err:
            print("***Error: Failed to uncompress %s (%s)" % (file, err))
        else:
            print("Successfully uncompressed %s" % (file))

    def tagFiles(self):
        """Browse the report directory and uncompress the gzip files found.

        A file is considered gzip when its first two bytes are the gzip
        magic number (0x1f 0x8b).
        """
        for f in os.listdir(self.reportpath):
            fullpath = os.path.join(self.reportpath, f)  # no gz extension yet
            if not os.path.isfile(fullpath):
                continue
            with open(fullpath, 'rb') as candidate:
                magic = candidate.read(2)
            if magic == b'\037\213':  # magic number for application/x-gzip
                # first give the gz extension to the file, then uncompress it
                os.rename(fullpath, fullpath + ".gz")
                self.uncompressGzip(fullpath + ".gz")

    def decodePayload(self, payload):
        """Decode a raw frame and return its TCP data as a list of lines.

        The data is split on CRLF; every line that followed a CRLF is
        prefixed with the CRLF marker.  Returns None when the RST flag is
        set or when the frame cannot be decoded down to a TCP segment.
        """
        try:
            eth = Decoders.EthDecoder().decode(payload)
            tcp = eth.child().child()
            if tcp.get_RST() == 1:
                return None
            data = tcp.get_data_as_string()  # raw data
            return data.replace('\r\n', '\r\n' + _CRLF_MARK).split('\r\n')
        except Exception:
            # Deliberate best effort: anything that does not decode as
            # Ethernet/x/TCP is simply not an HTTP packet for this tool.
            return None

    def writeFile(self, f, content):
        """Append content to the file named f inside the report directory."""
        # Binary mode: attachments are raw bytes and must not go through
        # newline translation.
        with open(os.path.join(self.reportpath, f), 'ab') as obFile:
            obFile.write(content)

    def decodeMac(self, mac):
        """Return a sequence of byte values as colon-separated hex pairs."""
        return ":".join("%02x" % octet for octet in mac)

    def createFlows(self):
        """Read the capture and write report.html plus the attachments.

        A packet that cannot be processed is reported on stdout and
        skipped; reading stops at the end of the capture.
        """
        print("running...")
        self.writeFile(_REPORT, '<html>'
            + '<head><style>td { font-size:8pt; }</style></head>'
            + '<body><table border="1" style="width:1000px"><tr>'
            + '<th style="width:100px">Num.</th>'
            + '<th style="width:200px">Flow</th>'
            + '<th style="width:600px;word-wrap:true">Request/Response</th>'
            + '<th style="width:100px">Attachment</th>'
            + '</tr>')
        reader = pcapy.open_offline(self.pcapfile)
        decoders = (Decoders.EthDecoder(),
                    Decoders.IPDecoder(),
                    Decoders.TCPDecoder())

        # packetnum starts at 0 so that binary data seen before the first
        # HTTP response still has a file to go to.
        self._packetnum = 0
        self._ext = ''
        self._lastAttach = ''
        countPacket = 0

        (header, payload) = reader.next()
        try:
            # An empty (or missing) payload marks the end of the capture.
            while payload:
                countPacket += 1
                if countPacket % 100 == 0:
                    print("(%d packets already processed)" % countPacket)
                try:
                    self._processPacket(payload, countPacket, decoders)
                except Exception as err:
                    print("***Error: packet %d skipped (%s)"
                          % (countPacket, err))
                try:
                    (header, payload) = reader.next()
                except Exception:
                    # NOTE(review): some pcapy versions may raise at end of
                    # file instead of returning an empty payload - confirm.
                    # Either way nothing more can be read.
                    break
        except KeyboardInterrupt:
            print("Interrupted: closing the report with the packets read so far")

        print("\n%d packets have been detected in this pcap file" % countPacket)
        self.writeFile(_REPORT,
            "</table>\n%d packets have been detected in this pcap file</body></html>"
            % countPacket)

    def _processPacket(self, payload, countPacket, decoders):
        """Report one frame; frames that are not IPv4/TCP are ignored."""
        arrline = self.decodePayload(payload)
        if not arrline:
            return  # RST flag set or not decodable
        eth_decoder, ip_decoder, tcp_decoder = decoders

        ethernet = eth_decoder.decode(payload)
        if ethernet.get_ether_type() != Packets.IP.ethertype:
            return
        ip = ip_decoder.decode(payload[ethernet.get_header_size():])
        if ip.get_ip_p() != Packets.TCP.protocol:
            return
        tcp = tcp_decoder.decode(
            payload[ethernet.get_header_size() + ip.get_header_size():])

        smac = self.decodeMac(ethernet.get_ether_shost())
        dmac = self.decodeMac(ethernet.get_ether_dhost())
        ipsrc = ip.get_ip_src()
        ipdst = ip.get_ip_dst()
        sport = str(tcp.get_th_sport())
        dport = str(tcp.get_th_dport())
        sessionFile = "session-" + ipsrc + "." + sport + "-" + ipdst + "." + dport
        flow = (ipsrc + ':' + sport + '<br />(' + smac + ')'
                + '<br />-><br />'
                + ipdst + ':' + dport + '<br />(' + dmac + ')')

        for line in arrline:
            if line.strip() == "":
                continue
            if chardet.detect(line)['encoding'] == 'ascii':
                self._reportTextLine(line.replace(_CRLF_MARK, ''),
                                     flow, countPacket)
            else:
                self._storeAttachmentData(line, sessionFile)

    def _reportTextLine(self, line, flow, countPacket):
        """Append one printable line (request, status or header) to the report."""
        if line.startswith(("GET ", "HTTP/")):
            if line.startswith("HTTP/"):
                # A response starts: following raw data goes to a new file.
                self._packetnum = countPacket
            self.writeFile(_REPORT,
                '<td> </td>'
                + '<tr><td>' + str(countPacket) + '</td>'
                + '<td>' + flow + '</td><td>')
        if line.startswith("Content-Type"):
            style = ' style="background:#ffff00"'
            ext = _extensionFromContentType(line)
            if ext is not None:  # malformed header: keep the previous one
                self._ext = ext
        else:
            style = ''
        # The line comes straight from the capture: escape it so it cannot
        # inject markup into the report.
        self.writeFile(_REPORT, '<div' + style + '>' + _escapeHtml(line) + '</div>')

    def _storeAttachmentData(self, line, sessionFile):
        """Append one chunk of raw data to the attachment of the current response."""
        ext = self._ext
        attachment = sessionFile + "-" + str(self._packetnum) + ext
        if attachment != self._lastAttach:
            # New file: link it from the report.  Its first chunk follows the
            # blank line ending the headers, so the marker is just dropped.
            line = line.replace(_CRLF_MARK, '')
            self._lastAttach = attachment
            link = '</td><td align="center"><a href="' + attachment + '">'
            if ext in (".jpeg", ".gif"):
                link += ('<img src="' + attachment
                         + '" border="2" style="width:100px;" />')
            else:
                link += ('<div style="background:#ff0000;color:#fff;'
                         'font-weight:bold;width:50px;text-align:center">'
                         + ext[1:] + '</div>')
            link += '</a></td></tr>'
            self.writeFile(_REPORT, link)
        else:
            # Same file: put back the CRLF removed when splitting into lines.
            line = line.replace(_CRLF_MARK, '\r\n')
        # Content of the file
        self.writeFile(attachment, line)


def main():
    """Parse the command line, build the report, then gunzip attachments."""
    usage = "usage: %prog -r <file> [options]"
    parser = OptionParser(usage)
    parser.add_option("-r", "--read-file", dest="pcapfile",
        help="Capture file to process (pcap format)")
    parser.add_option("-o", '--output', dest="output_directory",
        default="./report",
        help="Reporting directory (default: ./report/)")
    parser.add_option("-f", '--force', dest="force",
        default=False, action="store_true",
        help="Force overwriting of files")
    (options, args) = parser.parse_args(sys.argv)

    if not options.pcapfile:
        parser.error("Capture file is missing. Use -r <file>.")
    output = options.output_directory
    if output and os.path.isfile(output):
        parser.error("Use a different name for output directory since it is already used for a file")
    if output and os.path.isdir(output) and not options.force:
        if os.listdir(output):
            parser.error("Output directory is not empty. Use -f to overwrite content")
    if options.force:
        shutil.rmtree(output, ignore_errors=True)

    p = pcapGzip(options.pcapfile, output)
    p.createFlows()
    p.tagFiles()


if __name__ == '__main__':
    main()