#!/home/ellina/packages/bin/python
"""CGI script: fetch a URL, run content extraction on it and redirect the
client to a generated HTML report.

For a request carrying a ``url`` form field the script downloads the page,
passes it to ``content_extract.extract_content``, renders the pruned tree
to a PNG through the configured ``dot`` binary, writes an HTML report into
``config.data_dir`` and answers with a ``Location`` header pointing at the
report under ``config.web_root``.

NOTE(review): this file was recovered from a whitespace-collapsed copy in
which the HTML tags inside the report template had been stripped.  The
markup in ``produce_output_html`` is a reconstruction from the surviving
text and format arguments -- confirm it against a known-good copy.
"""
import os
import sys
import cgi
import cgitb
import popen2  # no longer used below; kept in case other code relies on it
import subprocess
import urllib2
import urlparse

try:
    import hashlib
except ImportError:
    # Interpreters without hashlib: the md5 module also exposes ``md5()``.
    import md5 as hashlib

from cStringIO import StringIO

from cextract_config import Config as config
sys.path.insert(0, config.code_dir)
import content_extract as ce

cgitb.enable()

DEFAULT_USERAGENT = 'Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101'

# urllib2 also opens file: and ftp: URLs; only these schemes are accepted
# from the request so that local files cannot be fetched and published.
ALLOWED_SCHEMES = ('http', 'https')


def get_page(url):
    """Download *url* and return the raw response body.

    The request is sent with ``DEFAULT_USERAGENT`` as its User-Agent
    header.  Exceptions raised by ``urllib2`` propagate to the caller.
    """
    request = urllib2.Request(url)
    request.add_header('User-Agent', DEFAULT_USERAGENT)
    opener = urllib2.build_opener()
    response = opener.open(request)
    try:
        return response.read()
    finally:
        response.close()


def encode(text):
    """Return *text* encoded as UTF-8, dropping characters that cannot be
    encoded."""
    return text.encode('utf8', 'ignore')


def _escape(text):
    """Escape *text* for HTML element content or a double-quoted attribute."""
    return cgi.escape(text, True)


def _page_id(url):
    """Return the md5 hex digest of *url*, used to name generated files."""
    return hashlib.md5(url).hexdigest()


def produce_output_html(url, extracted_content):
    """Build the HTML report for *url* and return it as a string.

    *extracted_content* is the object returned by ``ce.extract_content``;
    its ``chosen_node``, ``title``, ``meta_description`` and ``keywords``
    attributes are read here.  The report links to the tree image at
    ``config.web_root + '<md5 of url>.png'``.
    """
    ec = extracted_content
    html_part = ce.get_xml(ec.chosen_node)
    img_path = config.web_root + '%s.png' % _page_id(url)
    # The URL comes from the request and the title/description/keywords
    # from the fetched page, so all of them are escaped before being
    # placed into markup.
    safe_url = _escape(url)

    o = StringIO()

    def emit(line):
        # Same effect as ``print >> o, line``: the value, then a newline.
        o.write(line)
        o.write('\n')

    emit('<html>')
    emit(' <head>')
    emit('  <title>%s</title>' % (safe_url))
    emit(' </head>')
    emit(' <body>')
    emit('<h1><a href="%s">%s</a></h1>' % (safe_url, safe_url))
    emit('<h2>Head - Meta information</h2>')
    emit(' <b>Title:</b> %s<br/>' % _escape(encode(ec.title)))
    emit(' <b>Desc:</b> %s<br/>' % _escape(encode(ec.meta_description)))
    emit('<h2>Extracted HTML</h2>')
    # The extracted fragment is emitted as markup on purpose so the report
    # shows it rendered; it is whatever ce.get_xml() returned, unescaped.
    emit(html_part)
    emit('<h2>Extracted Keywords</h2>')
    emit(' <table>')
    for source, keyword in ec.keywords:
        emit('  <tr><td>%s</td><td>%s</td></tr>' %
             (_escape(encode(keyword)), _escape(encode(source))))
    emit(' </table>')
    emit('<h2>Pruned tree</h2>')
    emit(' <a href="%s">' % img_path)
    emit('  <img src="%s"/>' % img_path)
    emit(' </a>')
    emit(' </body>')
    emit('</html>')
    return o.getvalue()


def main(url):
    """Process *url* and print a ``Location`` header for the report.

    Writes ``<md5 of url>.png`` and ``<md5 of url>.html`` into
    ``config.data_dir`` and prints the redirect to standard output.
    """
    data = get_page(url)
    ec = ce.extract_content(data)
    chosen_node = ec.chosen_node

    # make the dot graph
    stream = StringIO()
    nodes, links = ec.pruned_tree
    ce.make_dot_graph(nodes, links, id(chosen_node), stream)
    dot_code = stream.getvalue()

    _id = _page_id(url)
    img_name = os.path.join(config.data_dir, '%s.png' % _id)
    # Same shell command as before; it is built only from configuration
    # values and a hex digest, never from request text.
    command = '%s -Tpng > "%s"' % (config.dot_bin, img_name)
    dot_process = subprocess.Popen(command, shell=True,
                                   stdin=subprocess.PIPE)
    # communicate() sends the graph, closes stdin so the child sees EOF and
    # waits for it to exit, so the image is complete before we redirect.
    dot_process.communicate(dot_code)

    html = produce_output_html(url, ec)
    html_name = os.path.join(config.data_dir, '%s.html' % _id)
    html_path = config.web_root + '%s.html' % _id
    html_file = open(html_name, 'w')
    try:
        html_file.write(html)
    finally:
        html_file.close()

    # Byte-for-byte what ``print 'Location: %s\n\n' % html_path`` emitted.
    sys.stdout.write('Location: %s\n\n\n' % html_path)


def _is_allowed_url(url):
    """Return True when *url* is non-empty and uses an allowed scheme."""
    if not url:
        return False
    # Index 0 is the scheme; indexing works on every urlparse result type.
    return urlparse.urlparse(url)[0].lower() in ALLOWED_SCHEMES


def _bad_request(message):
    """Write a plain-text 400 response carrying *message*."""
    sys.stdout.write('Status: 400 Bad Request\n')
    sys.stdout.write('Content-Type: text/plain\n\n')
    sys.stdout.write('%s\n' % message)


if __name__ == '__main__':
    form = cgi.FieldStorage()
    # getfirst() returns a single string even if the field is repeated.
    url = form.getfirst('url', None)
    if _is_allowed_url(url):
        main(url)
    else:
        _bad_request('A "url" parameter holding an http or https URL '
                     'is required.')