#!/usr/bin/python
#
# MiniGoofil - Metadata analyzer, information gathering tool
# A rewrite of MetaGoofil to make sense (TM) under GNU GPL
# v0.1 (c) 2009 Francesco `ascii` Ongaro
# http://www.ush.it/team/ascii/hack-minigoofil/minigoofil.py
#
# MiniGoofil is a utility to extract metadata from various
# document formats, like PDF, DOC, XLS, PPT, ODP, ODS, etc.
#
# MiniGoofil was created because MetaGoofil, while a great tool,
# is horribly written and hard to extend. MiniGoofil overcomes
# these limitations and is written in the Python Way.
#
# You can find the original tool, developed by Christian 
# Martorella (Laramies) at the following url:
# http://www.edge-security.com/metagoofil.php

import os, subprocess, sys, re

# Table of [label, regular expression] pairs applied to every line of the
# scanned file. Each pattern has exactly one capture group: the metadata
# value that gets printed next to the label. The position of an entry in
# this list is printed as its numeric id, so keep the order stable.
# Patterns are raw strings so that regex escapes such as \( reach the
# regex engine untouched.
definitions = [
	# Metagoofil definitions
	['Author', r"Author -(.*?)- "],
	['Creator', r"creator -(.*?)- "],
	['Last saved by', r"last saved by -(.*?)- "],
	['Author', r": Author '(.*?)'"],
	['Worked on', r"worked on (.*?)'"],
	['Template', r"template -(.*?)-"],

	# Adobe PDF "long xpacket"
	['XMP Toolkit', r"<x:xmpmeta xmlns:x='adobe:ns:meta/' x:xmptk='(.*)'>"],
	['Producer', r"<rdf:Description.*?pdf:Producer='(.*?)'></rdf:Description>"],
	['Date Created', r"<rdf:Description.*?xap:CreateDate='(.*?)'.*?</rdf:Description>"],
	['Creator', r"<rdf:Description.*?xap:CreatorTool='(.*?)'.*?</rdf:Description>"],
	# The capture group used to be (.*^), which can never match after the
	# attribute prefix; it is now the same non-greedy group as its siblings.
	['Date Modified', r"<rdf:Description.*?xap:ModifyDate='(.*?)'.*?</rdf:Description>"],
	['Document ID', r"<rdf:Description.*xapMM:DocumentID='(.*?)'/>"],
	['Title', r"<dc:title><rdf:Alt><rdf:li.*?>(.*?)</rdf:li></rdf:Alt></dc:title>"],
	['Creator', r"<dc:creator><rdf:Seq><rdf:li>(.*?)</rdf:li></rdf:Seq></dc:creator>"],

	# Adobe PDF "short xpacket"
	['Date Modified', r"/?ModDate\((.*?)\)/?"],
	['Date Created', r"/?CreationDate\((.*?)\)/?"],
	['Title', r"/?Title\((.*?)\)/?"],
	['Creator', r"/?Creator\((.*?)\)/?"],
	# NOTE(review): 'xrefbjcer' is kept verbatim from the original table;
	# confirm that this key really occurs in documents.
	['Creator Xref', r"/?xrefbjcer\((.*?)\)/?"],
	['Producer', r"/?Producer\((.*?)\)/?"],
	['Author', r"/?Author\((.*?)\)/?"],
	['Company', r"/?Company\((.*?)\)/?"],
	['Source Modified', r"/?SourceModified\((.*?)\)/?"],
]

def mac_address_extractor(filename):
	"""Return a MAC-address-like string found in *filename*, or ''.

	The file is searched for the first occurrence of a dash, twelve
	alphanumeric characters and a closing brace (the tail of a
	brace-delimited identifier such as '...-001122AABBCC}'). Those twelve
	characters are returned as six colon-separated pairs, e.g.
	'00:11:22:AA:BB:CC'. An empty string is returned when nothing matches.
	"""
	# Read as bytes: the scanned documents are binary, and the context
	# manager guarantees the handle is closed.
	with open(filename, 'rb') as fp:
		data = fp.read()

	# NOTE(review): the character class accepts any letter, not only the
	# hex digits A-F; kept as in the original pattern.
	found = re.search(br'-([0-9a-zA-Z]{12})\}', data)
	if found is None:
		return ''

	mac = found.group(1)
	# On Python 3 the match is bytes; hand back a native string everywhere.
	if not isinstance(mac, str):
		mac = mac.decode('ascii')
	return ':'.join([mac[pos:pos + 2] for pos in range(0, 12, 2)])

def main():
	"""Command-line entry point: print the metadata found in one document.

	Expects exactly one argument, the path of the file to analyze. Exits
	with status 2 when the argument count is wrong or the path is not a
	regular file. Otherwise it prints, in order:

	* a 'Working' banner with the output of the external 'extract' tool,
	* the MAC address reported by mac_address_extractor(), if any,
	* one line per match of every pattern in 'definitions', prefixed with
	  the position of the pattern in that table.
	"""
	if len(sys.argv) != 2:
		print('Usage: ./minigoofil filename.pdf')
		sys.exit(2)

	filename = sys.argv[1]

	if not os.path.isfile(filename):
		print('File not found!')
		sys.exit(2)

	# The external 'extract' tool only feeds the banner; if it cannot be
	# started, carry on without its output rather than dying on OSError.
	try:
		output = subprocess.Popen(['extract', filename], stdout=subprocess.PIPE).communicate()[0]
	except OSError:
		sys.stderr.write("Warning: could not run 'extract', skipping its output\n")
		output = b''
	# Python 3 returns bytes from the pipe; convert to a native string.
	if not isinstance(output, str):
		output = output.decode('utf-8', 'replace')
	filetype = output.splitlines()
	print('Working >> ' + filename + ' << ' + ''.join(filetype))

	mac = mac_address_extractor(filename)
	if mac:
		print("\t" + 'Mac address:' + mac)

	# Compile every pattern once instead of once per line of the file.
	compiled = [(label, re.compile(pattern)) for label, pattern in definitions]

	# Binary mode: documents are not text. The handle is closed on exit.
	with open(filename, 'rb') as fp:
		for line in fp:
			# latin-1 maps every byte to one character, so the patterns
			# see the same data on Python 3 as the raw bytes on Python 2.
			if not isinstance(line, str):
				line = line.decode('latin-1')
			for i, (label, pattern) in enumerate(compiled):
				for result in pattern.findall(line):
					print("\t" + str(i) + "\t" + label + ': ' + sanitize(result))

def sanitize(src):
	"""Return *src* with every non-printable character replaced by '.'.

	Only characters in the printable ASCII range 0x20-0x7E are kept.
	*src* is normally a single string (one regex capture group); a tuple
	or list of strings is also accepted, in which case the cleaned pieces
	are concatenated. Byte strings are accepted on Python 3 as well.
	"""
	if isinstance(src, (tuple, list)):
		parts = src
	else:
		parts = [src]

	cleaned = []
	for part in parts:
		# Python 3 bytes iterate as ints; map each byte to one character
		# first. On Python 2 'bytes' is 'str', so nothing is decoded.
		if isinstance(part, bytes) and not isinstance(part, str):
			part = part.decode('latin-1')
		cleaned.append(''.join([c if 0x20 <= ord(c) < 0x7F else '.' for c in part]))
	return ''.join(cleaned)

# Run the tool only when executed as a script (not when imported) and
# hand main()'s return value to the interpreter as the exit status.
if "__main__" == __name__:
	exit_status = main()
	sys.exit(exit_status)

