#!/usr/bin/python
"""Extract CGI forms from a web page to standard output.

The forms are printed in a simplified form suitable for customizing:
font tags are dropped, whitespace is normalized, and ``action``/``src``
URLs are made absolute so the saved page still talks to the right server.

Usage: httpdn.py [http://]url

NOTE(review): the copy of this script that was reviewed had its HTML tag
literals stripped out.  They were reconstructed from the surviving offsets
and comments ('</head>' from the ``+7`` offset, '<title', '<form',
'</form').  The per-form separator (FORM_SEPARATOR) could not be recovered
and is a guess; confirm against a pristine copy if one exists.
"""

import re
import string  # noqa: F401  (retained from the original import list)
import sys

try:  # Python 3
    import http.client as httplib
    from urllib.parse import urljoin, urlsplit
except ImportError:  # Python 2
    import httplib
    from urlparse import urljoin, urlsplit

USAGE = 'Usage: httpdn.py [http://]url'
DEFAULT_PATH = '/index.html'  # requested when the URL names only a host
MAX_REDIRECTS = 5
FORM_SEPARATOR = '<hr>'  # NOTE(review): original literal lost; see above

_SCHEME_RE = re.compile(r'(https?)://', re.IGNORECASE)
_CHARSET_RE = re.compile(r'charset=["\']?([\w.:-]+)', re.IGNORECASE)
_HEAD_END_RE = re.compile(r'</head>', re.IGNORECASE)
_TITLE_RE = re.compile(r'<title\b[^>]*>(.*?)</title',
                       re.IGNORECASE | re.DOTALL)
_FORM_RE = re.compile(r'<form\b.*?</form[^>]*>', re.IGNORECASE | re.DOTALL)
# [^>]* instead of .* so that only the font tag itself is removed, not
# everything up to the last '>' on the line.
_FONT_TAG_RE = re.compile(r'</?font\b[^>]*>', re.IGNORECASE)
_NEWLINE_WS_RE = re.compile(r'\s*\n\s*')
_URL_ATTR_RE = re.compile(r'\b(action|src)\s*=\s*"([^"]*)"', re.IGNORECASE)


def split_url(url):
    """Split a command-line URL into ``(scheme, host, path)``.

    The scheme prefix is optional and defaults to ``http``.  The path keeps
    its leading slash; when the URL has no path, DEFAULT_PATH is used.
    """
    scheme = 'http'
    match = _SCHEME_RE.match(url)
    if match:
        scheme = match.group(1).lower()
        url = url[match.end():]
    host, slash, rest = url.partition('/')
    path = slash + rest if slash else DEFAULT_PATH
    return scheme, host, path


def _decode_body(raw, content_type):
    """Return the response body as text.

    On Python 2 the body is already a ``str`` and is returned untouched.
    On Python 3 the bytes are decoded with the charset named in the
    Content-Type header, falling back to UTF-8.
    """
    if isinstance(raw, str):
        return raw
    match = _CHARSET_RE.search(content_type or '')
    charset = match.group(1) if match else 'utf-8'
    try:
        return raw.decode(charset, 'replace')
    except LookupError:  # server announced a charset Python does not know
        return raw.decode('utf-8', 'replace')


def fetch_page(url, max_redirects=MAX_REDIRECTS):
    """Fetch an absolute http(s) URL, following redirects.

    Returns ``(final_url, html)`` where ``final_url`` is the URL that
    actually produced the page (needed to resolve relative links).

    Raises IOError when more than ``max_redirects`` redirects are chained;
    connection-level errors from the HTTP library propagate unchanged.
    """
    for _ in range(max_redirects + 1):
        parts = urlsplit(url)
        path = parts.path or '/'
        if parts.query:
            path += '?' + parts.query
        if parts.scheme == 'https':
            conn = httplib.HTTPSConnection(parts.netloc)
        else:
            conn = httplib.HTTPConnection(parts.netloc)
        try:
            conn.request('GET', path)
            resp = conn.getresponse()
            status = resp.status
            location = resp.getheader('Location')
            content_type = resp.getheader('Content-Type', '')
            raw = resp.read()
        finally:
            conn.close()
        if 300 <= status <= 399 and location:
            # Location may be absolute (possibly on another host) or
            # relative, so resolve it against the URL just requested and
            # open a fresh connection for it.
            url = urljoin(url, location)
            continue
        return url, _decode_body(raw, content_type)
    raise IOError('too many redirects (last: %s)' % url)


def split_head(html):
    """Split *html* after its ``</head>`` tag into ``(head, body)``.

    When there is no ``</head>`` the head is empty and the whole document
    is treated as body.
    """
    match = _HEAD_END_RE.search(html)
    if match is None:
        return '', html
    return html[:match.end()], html[match.end():]


def extract_title(head):
    """Return the text inside ``<title>...</title>``, or None if absent."""
    match = _TITLE_RE.search(head)
    return match.group(1) if match else None


def extract_forms(body):
    """Return every ``<form ...>...</form>`` element in *body*, in order."""
    return _FORM_RE.findall(body)


def simplify_form(form, base_url):
    """Return *form* cleaned up for hand editing.

    *base_url* is the absolute URL of the page the form came from; it is
    used to turn ``action`` and ``src`` attribute values into absolute URLs
    so the form connects to, and fetches from, the correct server.
    """
    form = form.replace('\r\n', '\n')  # more standard, easier to parse
    form = form.replace('><', '>\n<')  # one tag per line is more readable
    form = _FONT_TAG_RE.sub('', form)  # font directives are just noise
    # Trim whitespace on both sides of every newline and collapse blank
    # lines in a single pass.
    form = _NEWLINE_WS_RE.sub('\n', form)

    def absolutize(match):
        return '%s="%s"' % (match.group(1),
                            urljoin(base_url, match.group(2)))

    return _URL_ATTR_RE.sub(absolutize, form)


def render_forms_page(html, base_url):
    """Build the output document: the page title plus its simplified forms.

    Returns the document as one string with lines joined by newlines and
    no trailing newline.
    """
    head, body = split_head(html)
    lines = ['<html>', '<head>']
    title = extract_title(head)
    if title is not None:
        lines.append('<title>' + title + '</title>')
    lines.extend(['</head>', '<body>'])
    for form in extract_forms(body):
        lines.append(simplify_form(form, base_url))
        lines.append(FORM_SEPARATOR)
    lines.extend(['</body>', '</html>'])
    return '\n'.join(lines)


def main(argv=None):
    """Command-line entry point; returns the process exit status.

    *argv* is the argument list without the program name and defaults to
    ``sys.argv[1:]``.  Only the first argument (the URL) is used.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        sys.stderr.write(USAGE + '\n')
        return 2
    scheme, host, path = split_url(args[0])
    sys.stdout.write('Host: ' + host + ' File: ' + path + '\n')
    try:
        final_url, html = fetch_page('%s://%s%s' % (scheme, host, path))
    except (httplib.HTTPException, IOError) as err:
        sys.stderr.write('httpdn: cannot fetch page: %s\n' % err)
        return 1
    sys.stdout.write(render_forms_page(html, final_url) + '\n')
    return 0


if __name__ == '__main__':
    sys.exit(main())