#!/usr/bin/env python
#
#       smtpParser.py
#       
#       Copyright 2009 Serge Gorbunov <sgorbunov@hotmail.com> 
#		<http://gserge.com>
#       
#       This program is free software; you can redistribute it and/or modify
#       it under the terms of the GNU General Public License as published by
#       the Free Software Foundation; either version 2 of the License, or
#       (at your option) any later version.
#       
#       This program is distributed in the hope that it will be useful,
#       but WITHOUT ANY WARRANTY; without even the implied warranty of
#       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#       GNU General Public License for more details.
#       
#       You should have received a copy of the GNU General Public License
#       along with this program; if not, write to the Free Software
#       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#       MA 02110-1301, USA.

# The Scapy collection of classes is required to run this program.
# Download: http://www.secdev.org/projects/scapy/

import base64
import getopt
import hashlib
import os
import re
import string
import sys
import tempfile
import urlparse
import zipfile

from   scapy.all       import *

from   streamExtractor import *
from   smtp            import *

# Global Variables
# ----------------------------------------------------------------------
# outputDirName is the prefix of the per-stream output directory; main()
# appends the index of the SMTP stream to it. The analysis of every
# separate SMTP stream is stored in its own directory for clarity.
outputDirName     = "stream" 
# Name of the summary file that is written inside each stream directory.
outputSummaryFile = "streamSummary.txt"

# Defaults used by main() when the matching command-line option is absent.
defaultSMTPport           = 587
defaultExtractImages      = False
defaultExtractAttachments = False

class smtpParser(object):
	# Parses the packets of one SMTP TCP stream into a mail object and
	# writes the extracted summary, attachments and DOCX images to disk.

	# Tokens accepted as the client greeting verb. EHLO and HELO are the
	# SMTP greeting commands; 'ECHO' is not an SMTP verb but was matched by
	# earlier versions of this parser and is kept for backward compatibility.
	_greetingVerbs = ( 'EHLO', 'HELO', 'ECHO' )

	# Path prefix inside a DOCX archive under which embedded media is looked up.
	_docxMediaPrefix = 'word/media/'

	# Function: Extracts SMTP information from the TCP stream: the client
	#           greeting, the replies to the base64 "Username:"/"Password:"
	#           prompts, the envelope addresses, the raw DATA payload and
	#           the base64 bodies of any attachments.
	#
	# Input:   stream - sequence of Scapy packets forming one TCP stream;
	#                   it must contain at least one packet
	#
	# Output:  Mail object containing basic information about the mail extracted
	def extractSMTPinfo( self, stream ):

		# A request from the server is not necessarily followed directly
		# by the reply from the client in the pcap, so we record the ACK
		# number of the request and, when a packet with the corresponding
		# Seq is met later, extract its payload into the matching field.
		usernameSeq    = None
		passwordSeq    = None
		dataSeq        = None
		dataAck        = None
		attachmentData = False
		attachmentName = ''

		first = stream[0][Ether][IP]
		mail = SMTPmail( first.src, first.dst, first[TCP].sport, first[TCP].dport )

		for packet in stream:
			try:
				tcp  = packet[Ether][IP][TCP]
				seq  = tcp.seq
				ack  = tcp.ack
				load = tcp[Raw].load
				arr  = load.split( ' ' )

				for index, token in enumerate( arr ):
					# Look one token ahead without running off the end of
					# the list, so a keyword in last position cannot abort
					# the handling of the whole packet.
					nextToken = arr[index + 1] if index + 1 < len( arr ) else None

					# Client greeting
					if token in self._greetingVerbs:
						if nextToken is not None:
							mail.client = nextToken.rstrip( "\r\n" )
					# Username request (base64 of "Username:")
					elif token == '334' and nextToken == 'VXNlcm5hbWU6\r\n':
						usernameSeq = ack
					# Password request (base64 of "Password:")
					elif token == '334' and nextToken == 'UGFzc3dvcmQ6\r\n':
						passwordSeq = ack
					elif token == 'MAIL':
						sender = self._addressArgument( arr, index, 'FROM:' )
						if sender is not None:
							mail.mailfrom = sender
					elif token == 'RCPT':
						recipient = self._addressArgument( arr, index, 'TO:' )
						if recipient is not None:
							mail.mailto = recipient
					# Server is ready for the message data; the client's data
					# packets start at the sequence number acknowledged here.
					elif token == '354':
						dataSeq = ack

				if usernameSeq is not None and usernameSeq == seq:
					mail.username = self.getPayloadAndDecode( packet )
					usernameSeq = None
				elif passwordSeq is not None and passwordSeq == seq:
					mail.password = self.getPayloadAndDecode( packet )
					passwordSeq = None
				elif dataSeq is not None and dataSeq == seq:
					dataAck = ack

				# If the ack of the packet matches the ack number recorded for
				# the data response, the payload belongs to the message data.
				if dataAck is not None and dataAck == ack:
					mail.dataArray = "%s%s" % ( mail.dataArray, load )
					if re.search( ".filename", load ):
						# Start of an attachment: take the quoted file name and
						# the data following the blank line in the last token.
						m = re.search( r'filename=(.*)', load )
						attachmentName = m.group().split( '"' )[1]
						data = arr[-1].split( "\r\n\r\n" )[1]
						mail.attachments[attachmentName] = data
						attachmentData = True
					elif attachmentData:
						if "\r\n\r\n" in load:
							# A blank line ends the attachment body.
							attachmentData = False
							data = arr[-1].split( "\r\n\r\n" )[0]
							mail.attachments[attachmentName] += data
						else:
							mail.attachments[attachmentName] += load
			except Exception:
				# Best effort: packets without a payload or with content this
				# heuristic parser cannot handle are skipped. Unlike a bare
				# except, this lets KeyboardInterrupt/SystemExit through.
				continue

		return mail

	# Function: Returns the address argument that follows a MAIL or RCPT
	#           token. Both "FROM: <addr>" (separate token) and
	#           "FROM:<addr>" (no space after the colon) are accepted.
	#
	# Input:    arr     - payload split on single spaces
	#           index   - position of the MAIL/RCPT token in arr
	#           keyword - 'FROM:' or 'TO:'
	#
	# Output:   Address text up to the first CRLF, or None when the
	#           keyword does not follow or the address is missing
	def _addressArgument( self, arr, index, keyword ):
		if index + 1 >= len( arr ):
			return None
		argument = arr[index + 1]
		if argument == keyword:
			if index + 2 >= len( arr ):
				return None
			argument = arr[index + 2]
		elif argument.startswith( keyword ) and len( argument ) > len( keyword ):
			argument = argument[len( keyword ):]
		else:
			return None
		return argument.split( "\r\n" )[0]

	# Function: Extracts the data payload from the packet
	#
	# Input:    packet
	#
	# Output:   First line of the payload, decoded from base 64
	def getPayloadAndDecode( self, packet ):
		firstLine = packet[Ether][IP][TCP][Raw].load.split( "\r\n", 1 )[0]
		return self.decodeString( firstLine )

	# Function: Decodes a string from base 64
	#
	# Input:    Base 64 encoded string
	#
	# Output:   Decoded data
	def decodeString( self, str ):
		return base64.b64decode( str )

	# Function: Builds a path inside directory for a file name taken from
	#           untrusted data. Directory components are dropped so the
	#           file cannot be written outside directory, and a numeric
	#           prefix is added when the name is already taken so existing
	#           files (e.g. the summary file) are never overwritten.
	#
	# Input:    directory - directory the file will be created in
	#           name      - file name as found in the mail or archive
	#
	# Output:   Path of a file in directory that does not exist yet
	def _uniquePath( self, directory, name ):
		base = os.path.basename( name.replace( "\\", "/" ).replace( "\0", "" ) )
		if base in ( "", ".", ".." ):
			base = "unnamed"
		candidate = os.path.join( directory, base )
		suffix = 1
		while os.path.exists( candidate ):
			candidate = os.path.join( directory, "%d_%s" % ( suffix, base ) )
			suffix += 1
		return candidate

	# Function: Prints mail information and dumps the attachments from the mail object
	#
	# Input:    mail         - Mail object
	#           dir          - Output directory for the stream (must not exist yet)
	#           outputFile   - Output filename for the actual email
	#           attSwitch    - Extract attachments from the email (True/False)
	#           imagesSwitch - Extract images from any docx files (True/False)
	#
	# Output:   True when the summary was written, False when the directory
	#           or the summary file already exists
	def dumpSMTPinfo( self, mail, dir, outputFile, attSwitch, imagesSwitch ):
		if os.path.exists( dir ):
			print( "%s directory already exists. " % dir )
			print( "Please specify a different directory for data streams" )
			return False
		os.mkdir( dir )

		summaryPath = dir + "/" + outputFile
		if os.path.exists( summaryPath ):
			print( "%s file already exists" % summaryPath )
			return False

		with open( summaryPath, 'w' ) as f:
			f.write( "###___General_Mail_Info___###\n\n"        )
			f.write( "Source IP:      %s\n"     % mail.src      )
			f.write( "Destanation IP: %s\n"     % mail.dst      )
			f.write( "Client ID:      %s\n\n"   % mail.client   )
			f.write( "Source port:      %s\n"   % mail.sport    )
			f.write( "Destanation Post: %s\n\n" % mail.dport    )
			f.write( "Username: %s\n"           % mail.username )
			f.write( "Password: %s\n\n"         % mail.password )
			f.write( "Mail From: %s\n"          % mail.mailfrom )
			f.write( "Mail To:   %s\n"          % mail.mailto   )
			f.write( "\n"                                       )

			# The attachments in mail objects are stored in a dictionary
			# attachment name -> base 64 encoded data. Every attachment is
			# decoded and stored under its own name.
			if attSwitch:
				for att, data in sorted( mail.attachments.items() ):
					try:
						decoded = self.decodeString( data )
					except ( TypeError, ValueError ):
						# Captured data may be truncated or carry trailing
						# junk; note it and continue with the other attachments.
						f.write( "Attachment %s could not be base64-decoded; skipped\n" % att )
						continue

					# The attachment name comes from the captured mail and is
					# untrusted: never let it pick a path outside dir.
					attPath = self._uniquePath( dir, att )
					with open( attPath, "wb" ) as attachment:
						attachment.write( decoded )

					f.write( "Attachment checksum found in the mail:\n" )
					f.write( "%s %s\n" % ( att, hashlib.md5( decoded ).hexdigest() ) )
					savedName = os.path.basename( attPath )
					if savedName != att:
						f.write( "\tSaved as: %s\n" % savedName )

					if imagesSwitch:
						media = self.extractImagesFromDOCX( attPath, dir )
						if media:
							f.write( "\tMedia checksums found in the attachment:\n" )
							for k, v in sorted( media.items() ):
								f.write( "\t%s %s\n" % ( k, v ) )

			f.write( "\n"                                       )
			f.write( "###___Mail_DATA___###\n\n"                )
			f.write( mail.dataArray                             )
			f.write( "\n"                                       )

		return True

	# Function: Extracts any images stored inside a docx file
	#           and copies them into the destination directory.
	#           The archive is read with the zipfile module and checksums
	#           are computed with hashlib, so no file name from the mail
	#           or from the archive is ever passed to a shell.
	#
	# Input:    filename - DOCX filepath
	#           dstDir   - Destination directory for the images
	#
	# Output:   A dictionary image name -> MD5 checksum for every file
	#           found directly under word/media (empty if there is none)
	#           False if filename does not exist or is not a zip archive
	def extractImagesFromDOCX( self, filename, dstDir ):
		if not os.path.exists( filename ):
			return False
		if not zipfile.is_zipfile( filename ):
			return False

		try:
			archive = zipfile.ZipFile( filename )
		except ( zipfile.BadZipfile, IOError ):
			return False

		media = {}
		prefix = self._docxMediaPrefix
		try:
			for member in archive.namelist():
				if not member.startswith( prefix ):
					continue
				name = member[len( prefix ):]
				# Only files located directly in word/media are extracted.
				if not name or '/' in name or '\\' in name:
					continue
				try:
					data = archive.read( member )
				except ( zipfile.BadZipfile, RuntimeError, IOError, OSError ):
					# Corrupt, encrypted or unsupported member: skip it.
					continue
				with open( self._uniquePath( dstDir, name ), 'wb' ) as out:
					out.write( data )
				media[name] = hashlib.md5( data ).hexdigest()
		finally:
			archive.close()

		return media
	

# Function: Prints the command-line help text to standard output
#
# Input:    basename - program name shown in the usage and example lines
#
# Output:   None
def usage(basename):
	# The text is assembled first and emitted with one single-argument
	# print call, a form that is valid both as a Python 2 print statement
	# and as a Python 3 print function call.
	lines = [
		"",
		"SMTP Parser",
		"Usage:",
		"\t %s -f <pcap_source_file> [options] " % basename,
		"",
		"\t -f (--file) Mandatory option followed by the pcap input file name",
		"",
		"[Options]",
		"\t -h (--help)        Print the help page",
		"\t -d (--destanation) Destination path for output streams",
		"\t -p (--port)        SMTP destination port number (default is 587)",
		"\t -a (--attachments) Extract attachments from the emails",
		"\t -i (--images)      Extract images from any docx files",
		"",
		"Examples",
		"\t %s -f evidence.pcap" % basename,
		"\t %s -f evidence.pcap -p 25" % basename,
		"\t %s -f evidence.pcap -d /home/user/" % basename,
		"\t %s -f evidence.pcap -a -i" % basename,
	]
	print( "\n".join( lines ) )


# Function: Command-line entry point. Parses the options, extracts the
#           SMTP streams from the pcap file and stores every stream in
#           its own output directory.
#
# Input:    argv - accepted for compatibility with existing callers; the
#                  options are read from sys.argv
#
# Output:   None. Exits with status 2 on invalid command-line input.
def main(argv):

	destanationPath     = None
	inFile              = None
	smtpPort            = defaultSMTPport
	extractImagesSwitch = defaultExtractImages
	extractAttSwitch    = defaultExtractAttachments

	# Check options used to run the program. getopt rejects any option
	# that is not listed here, so the loop below only sees known ones.
	try:
		opts, args = getopt.getopt( sys.argv[1:],
			"hf:d:p:ai",
			["help", "file=", "destanation=", "port=", "attachments", "images"] )
	except getopt.GetoptError as err:
		# print help information and exit
		print( str( err ) )
		usage( sys.argv[0] )
		sys.exit( 2 )

	for opt, val in opts:
		if opt in ("-h", "--help"):
			usage( sys.argv[0] )
			sys.exit()
		elif opt in ("-f", "--file"):
			inFile = val
		elif opt in ("-d", "--destanation"):
			destanationPath = val
		elif opt in ("-p", "--port"):
			# getopt delivers strings; the port is used as a number below
			# (it is formatted with %d, and the default port is an int).
			try:
				smtpPort = int( val )
			except ValueError:
				smtpPort = None
			if smtpPort is None or not 0 < smtpPort < 65536:
				print( "ERROR: Invalid port number specified: %s" % val )
				sys.exit( 2 )
		elif opt in ("-a", "--attachments"):
			extractAttSwitch = True
		elif opt in ("-i", "--images"):
			extractImagesSwitch = True

	# Check if -f option was not used, then print usage and exit
	if inFile is None:
		print( "ERROR: No input file specified" )
		usage( sys.argv[0] )
		sys.exit( 2 )

	# Check if valid destanation path was specified
	if destanationPath is not None and not os.path.isdir( destanationPath ):
		print( "ERROR: Invalid destanation path specified" )
		sys.exit( 2 )

	# Extract data streams
	se = streamExtractor()
	dataStreams = se.extractStreams( inFile, smtpPort )

	if len( dataStreams ) == 0:
		print( "No streams were extracted for destanation port %s" % smtpPort )
	else:
		print( "%d streams extracted from the file on port %d" % ( len( dataStreams ), smtpPort ) )

	# Dump every stream into separate dir/summaryFile
	# Extract attachments and images from DOCX files if specified
	sp = smtpParser()
	for counter, dataStream in enumerate( dataStreams ):
		streamDir = outputDirName + str( counter )
		if destanationPath is not None:
			streamDir = os.path.join( destanationPath, streamDir )

		# Extractor extracts the content of the mail into an object
		# Dumper stores the content/attachments/images on the hard-drive
		mail = sp.extractSMTPinfo( dataStream )
		print( 'Storing stream #%s: srcIP = %s; dstIP = %s into %s'
			% ( counter, mail.src, mail.dst, os.path.abspath( streamDir ) ) )
		if sp.dumpSMTPinfo( mail, streamDir, outputSummaryFile, extractAttSwitch, extractImagesSwitch ):
			print( "Successfully stored stream #%s" % counter )
		else:
			print( "Error storing stream #%s" % counter )
		print( "" )


# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
	main( sys.argv[:] )



