#!/usr/bin/ruby

#    Copyright (C) 2010 Richard Springs
#    This program is free software: you can redistribute it and/or
#    modify it under the terms of the GNU General Public License as
#    published by the Free Software Foundation, either version 3 of
#    the License, or (at your option) any later version.
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#    General Public License for more details.
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see
#    <http://www.gnu.org/licenses/>.

require 'rubygems'

require 'cgi'
require 'digest/md5'
require 'digest/sha1'
require 'singleton'
require 'stringio'
require 'zlib'

require 'trollop'
require 'xmlsimple'


# Command line interface: every switch the script understands is declared
# here; the parsed values end up in the opts hash keyed by option name.
opts = Trollop::options do
  version "scarabsieve.rb / v1.01 | richardsprings"
  banner <<-EOS
	  Usage: scarabsieve.rb [options]
	  EOS
  opt :directory, "directory containing webscarab log output", :type => String
  opt :requestlinesummary, "summarize HTTP requestlines"
  opt :headersearch, "search HTTP headers using specified value", :type => String
  opt :parametersearch, "search HTTP request parameters using specified value", :type => String
  opt :xmlsearch, "search plist XML transmitted in HTTP responses using specified value", :type => String
  opt :Verbose, "display verbose information"
end

# The log directory is the only mandatory argument.
if opts[:directory].nil?
  Trollop::die :directory, "not specified"
end


# Holds the parsed contents of a directory of proxy conversation files
# (NNNN-request / NNNN-response) and builds the individual report sections.
#
# Conversation number N is stored at array index N-1 in every per-message
# array.  Each analysis method stores its result text in @data_hash under a
# fixed key; #report prints whichever sections have been produced.
class ProxyData
  include Singleton

  # Matches scheme://host[:port][/path]; port and path are optional.
  URIPATTERN = %r{\A([^:/]*)://([^/:]*)(?::([^/]*))?(/.*)?\z}

  def initialize
    @req_reqline = []
    @req_headers = []
    @req_messagebody = []

    @resp_statusline = []
    @resp_headers = []
    @resp_body = []
    @resp_body_xml = []

    # keys: "AllRequests", "HeaderSearch", "ParameterSearch", "DigestSums",
    # "PlistXMLSearch"
    @data_hash = {}
    @tl = Time.now.localtime
  end

  # Write object to filename in binary mode.  The block form of File.open
  # closes the handle even when the write raises.
  def savetofile(object, filename)
    File.open(filename.to_s, "wb") { |op_file| op_file.write(object) }
  end

  # Parse one HTTP request or response file and store its first line,
  # headers and body.
  #
  # filename - path whose basename looks like "0001-request" or
  #            "0001-response"; the leading number selects the slot.
  #
  # Files that do not exist, have no positive leading number, or are neither
  # a request nor a response are skipped silently.
  def processfile(filename)
    basename = File.basename(filename)
    slot = basename.split("-")[0].to_i - 1
    return if slot < 0

    # Classify on the basename only, so a directory called e.g. "requests"
    # cannot turn every response into a request.
    isrequest = basename.include?("request")
    return unless isrequest || basename.include?("response")
    return unless File.file?(filename)

    # Bodies may be images or gzip data, so read raw bytes.
    rawmessage = File.open(filename, "rb") { |f| f.read }
    firstline, httpheaders, httpbody = splitmessage(rawmessage)

    if isrequest
      @req_reqline[slot] = firstline
      @req_headers[slot] = httpheaders
      @req_messagebody[slot] = httpbody
    else
      @resp_statusline[slot] = firstline
      @resp_headers[slot] = httpheaders
      @resp_body[slot] = httpbody
    end
  end

  # Build the "AllRequests" section: one line per stored request giving its
  # number and the second token of the request line (the URI).
  def show_allrequests
    tempstring = String.new
    @req_reqline.each_with_index do |reqline, indexnumber|
      next if reqline.nil?
      # interpolation turns a missing URI (malformed request line) into ""
      tempstring << "#{indexnumber + 1}-request  #{httpfirstline(reqline)[1]}\n"
    end
    @data_hash["AllRequests"] = tempstring
  end

  # Split a request line or status line on whitespace.  Returns an array
  # (method, uri, protocol for a well-formed request line).
  def httpfirstline(firstline)
    firstline.split
  end

  # Search the request and response headers of one conversation.
  #
  # query    - text a header line must start with (case insensitive).  It is
  #            interpolated into a regular expression unescaped, so regexp
  #            metacharacters in it are interpreted as such.
  # indexnum - zero-based conversation index.
  #
  # Returns "N-request  <matching lines>" and/or "N-response  <matching
  # lines>" concatenated, or an empty string.  A conversation whose request
  # or response was never loaded simply contributes nothing.
  def search_headers(query, indexnum)
    tempstring = String.new
    searchqueryregex = /^#{query}.*\n/i
    sides = [["request", @req_headers[indexnum]], ["response", @resp_headers[indexnum]]]
    sides.each do |label, headers|
      next if headers.nil?
      matches = headers.gsub(/\r\n?/, "\n").scan(searchqueryregex)
      next if matches.empty?
      tempstring << "#{indexnum + 1}-#{label}  " << matches.join
    end
    tempstring
  end

  # Build the "HeaderSearch" section by running #search_headers over every
  # conversation slot.
  def search_allheaders(searchquery)
    tempstring = String.new
    slots = [@req_headers.length, @resp_headers.length].max
    (0...slots).each do |indexnumber|
      tempstring << search_headers(searchquery, indexnumber)
    end
    @data_hash["HeaderSearch"] = tempstring
  end

  # Build the "ParameterSearch" section.  For GET requests the query string
  # is searched, for POST requests the message body; other methods are
  # ignored.  Only the first case-insensitive hit per request is reported,
  # from the start of the hit up to the next "&" (or the end), URL-decoded.
  def search_allparameters(searchquery)
    tempstring = String.new
    needle = tobinary(searchquery).downcase
    @req_reqline.each_with_index do |reqline, indexnumber|
      next if reqline.nil?
      httpmethod, uri = httpfirstline(reqline)
      parameters =
        case httpmethod
        when "GET"  then uri.to_s.split("?", 2)[1] || ""
        when "POST" then @req_messagebody[indexnumber]
        end
      # nil: unsupported method, or a POST stored without a body
      next if parameters.nil?

      param_start = parameters.downcase.index(needle)
      next if param_start.nil?
      param_end = parameters.index("&", param_start) || parameters.length
      parameterstolog = tobinary(CGI.unescape(parameters[param_start...param_end]))
      tempstring << "#{indexnumber + 1}-request  #{parameterstolog}\n"
    end
    @data_hash["ParameterSearch"] = tempstring
  end

  # Break a request URI into its parts.
  #
  # Returns [uri_without_query, query, protocol, host, port, url, document]
  # where url is the path and document the text after the last "/" of the
  # path.  Parts that are absent (no port, no path, no query, no scheme) are
  # returned as empty strings; without a scheme the whole query-less URI is
  # treated as the path.
  def parseuri(uri)
    requestlineurlnoparams, requestlineurlparameters = uri.to_s.split("?", 2)
    requestlineurlnoparams ||= ""
    requestlineurlparameters ||= ""

    parts = URIPATTERN.match(requestlineurlnoparams)
    if parts
      protocol = parts[1]
      host = parts[2]
      port = parts[3].to_s
      url = parts[4].to_s
    else
      protocol = ""
      host = ""
      port = ""
      url = requestlineurlnoparams
    end
    document = url[%r{[^/]*\z}]

    [requestlineurlnoparams, requestlineurlparameters, protocol, host, port, url, document]
  end

  # Inflate a gzip-compressed string and return the decompressed data.
  # Raises a Zlib::Error subclass when the data is not valid gzip.
  def gzipinflate(string)
    gz = Zlib::GzipReader.new(StringIO.new(string))
    begin
      gz.read
    ensure
      gz.close
    end
  end

  # calculate MD5 and SHA1 sums; returns [md5hex, sha1hex]
  def getdigestsums(object)
    [Digest::MD5.hexdigest(object), Digest::SHA1.hexdigest(object)]
  end

  # Write response bodies of selected content types next to the logs and
  # build the "DigestSums" section.
  #
  # conversationsdirectory - directory the carved files are written to.
  #
  # image/*, text/html and application/x-javascript responses are saved as
  # "NNNN-response.carved.<document name from the request URI>", text/xml
  # responses as "NNNN-response.carved.xml" (and kept for #plistxmlsearch).
  # The content type and encodings are taken from the response headers.
  # The digests are computed over the bytes written to the carved file.
  def carve(conversationsdirectory)
    tempstring = String.new
    @resp_body.each_with_index do |body, indexnumber|
      next if body.nil? || @req_reqline[indexnumber].nil?

      contenttype = responseheader("content-type", indexnumber)
      if contenttype.include?("image/") || contenttype.include?("text/html") ||
         contenttype.include?("application/x-javascript")
        suffix = parseuri(httpfirstline(@req_reqline[indexnumber])[1])[6]
        isxml = false
      elsif contenttype.include?("text/xml")
        suffix = "xml"
        isxml = true
      else
        next
      end

      content = decodebody(body, indexnumber)
      # Only the number goes through sprintf: URL document names routinely
      # contain "%" escapes, which must not be read as format directives.
      carvedname = sprintf("%04d-response.carved.", indexnumber + 1) + suffix
      carvedfilename = File.join(conversationsdirectory, carvedname)
      savetofile(content, carvedfilename)

      md5value, sha1value = getdigestsums(content)
      tempstring << "#{md5value} #{sha1value} #{File.basename(carvedfilename)}\n"
      @resp_body_xml[indexnumber] = content if isxml
    end
    @data_hash["DigestSums"] = tempstring
  end

  # Build the "PlistXMLSearch" section from the XML bodies kept by #carve.
  # plist xml is NOT fun, so this uses a regex hack that counts on the
  # format <key>plist</key><string>notfun</string> appearing on one line.
  # query is interpolated into the regular expression unescaped.
  def plistxmlsearch(query)
    tempstring = String.new
    plistxmlsearchregex = /<key>.*#{query}.*<\/string>/i
    @resp_body_xml.each_with_index do |xml, indexnumber|
      next if xml.nil?
      xml.scan(plistxmlsearchregex).each do |match|
        # wrap the fragment in a root element so it parses as a document
        newxml = XmlSimple.xml_in("<root>" + match + "</root>")
        # join explicitly: interpolating an Array prints its inspect form
        # on Ruby 1.9 and later
        keytext = Array(newxml['key']).join
        stringtext = Array(newxml['string']).join
        tempstring << "#{indexnumber + 1}-response  #{keytext}: #{stringtext}\n"
      end
    end
    @data_hash["PlistXMLSearch"] = tempstring
  end

  # Print the timestamp taken at start-up followed by every section that has
  # been produced, in a fixed order.
  def report
    sections = [
      ["AllRequests",     "---- request line summary "],
      ["DigestSums",      "\n\n---- digest sums of carved files (MD5 SHA1 FILENAME) "],
      ["HeaderSearch",    "\n\n---- header search "],
      ["ParameterSearch", "\n\n---- parameter search "],
      ["PlistXMLSearch",  "\n\n---- plist xml search "]
    ]

    reportstring = String.new
    sections.each do |key, heading|
      next if @data_hash[key].nil?
      reportstring << heading + "-" * 50 + "\n\n"
      reportstring << @data_hash[key]
    end

    puts "timestamp: #{@tl}\n\n"
    puts reportstring
  end

  private

  # Split a raw HTTP message into [firstline, headers, body].
  # firstline keeps its line terminator, headers run up to and including the
  # terminator of the last header line, body is everything after the first
  # blank line (CRLF or bare LF).  body is nil when there is no blank line.
  def splitmessage(rawmessage)
    firstlineend = rawmessage.index("\n")
    return [rawmessage, "", nil] if firstlineend.nil?

    firstline = rawmessage[0..firstlineend]
    boundary = /(\r?\n)\r?\n/.match(rawmessage)
    if boundary
      [firstline,
       rawmessage[(firstlineend + 1)...boundary.end(1)],
       rawmessage[boundary.end(0)..-1]]
    else
      [firstline, rawmessage[(firstlineend + 1)..-1], nil]
    end
  end

  # Lower-cased, stripped value of the first response header called name
  # (case insensitive) for one conversation, or "" when absent.
  def responseheader(name, indexnum)
    headers = @resp_headers[indexnum]
    return "" if headers.nil?
    wanted = name.downcase
    headers.gsub(/\r\n?/, "\n").each_line do |line|
      key, value = line.split(":", 2)
      return value.strip.downcase if value && key.strip.downcase == wanted
    end
    ""
  end

  # Undo gzip content encoding when possible.  Chunked bodies are returned
  # untouched because they still carry chunk framing, and a body that fails
  # to inflate is returned as stored instead of aborting the whole run.
  def decodebody(body, indexnum)
    return body unless responseheader("content-encoding", indexnum).include?("gzip")
    return body if responseheader("transfer-encoding", indexnum).include?("chunked")
    gzipinflate(body)
  rescue Zlib::Error
    body
  end

  # Copy of string tagged as raw bytes (no-op on Rubies without string
  # encodings), so it can always be combined with the binary file data.
  def tobinary(string)
    copy = string.to_s.dup
    copy.force_encoding("BINARY") if copy.respond_to?(:force_encoding)
    copy
  end

# CLASS END
end



# initialize data
httpdata = ProxyData.instance

# Joining with an empty last component yields the directory with exactly one
# trailing separator, so "logs" and "logs/" are accepted alike.
logdirectory = File.join(opts[:directory], "")

# process each request/response pair
Dir.glob(logdirectory + "*-request").sort.each do |requestfile|
  httpdata.processfile(requestfile)
  # Rewrite only the trailing "request" of the file name; a substitution over
  # the whole path would also rewrite directory names containing "request".
  responsefile = requestfile.sub(/request\z/, "response")
  # a request may have been logged without a response
  httpdata.processfile(responsefile) if File.exist?(responsefile)
end

# all requests
httpdata.show_allrequests if opts[:requestlinesummary]

# header search
httpdata.search_allheaders(opts[:headersearch]) if opts[:headersearch]

# parameter search
httpdata.search_allparameters(opts[:parametersearch]) if opts[:parametersearch]

# file carving (always performed; it also collects the XML bodies that the
# plist search below works on)
httpdata.carve(logdirectory)

# plist xml data search
httpdata.plistxmlsearch(opts[:xmlsearch]) if opts[:xmlsearch]

# report
httpdata.report

