#!/home/ellina/packages/bin/python
import os
import sys
import cgi
import cgitb
import popen2
import urllib2
try:
import hashlib
except ImportError:
import md5 as hashlib
from cStringIO import StringIO
# Project configuration object; this script reads code_dir, data_dir,
# web_root and dot_bin from it.
from cextract_config import Config as config
# The configured code directory must be on the import path *before* the
# next import runs -- presumably that is where content_extract lives.
sys.path.insert(0, config.code_dir)
import content_extract as ce
# Turn on cgitb's handler for uncaught exceptions in this CGI script.
cgitb.enable()
# Value sent as the User-Agent header by get_page().
DEFAULT_USERAGENT = 'Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101'
def get_page(url):
    """Download ``url`` and return the raw response body.

    The request carries ``DEFAULT_USERAGENT`` as its User-Agent header.
    Any error raised by urllib2 while opening or reading the URL
    propagates to the caller.
    """
    request = urllib2.Request(url)
    request.add_header('User-Agent', DEFAULT_USERAGENT)
    opener = urllib2.build_opener()
    # NOTE(review): the default opener also accepts non-HTTP schemes
    # (e.g. file:); confirm whether URLs should be restricted upstream.
    response = opener.open(request)
    try:
        # get data from url
        return response.read()
    finally:
        # Release the connection even if reading fails part-way.
        response.close()
def encode(text):
    """Return ``text`` encoded as UTF-8.

    Characters that cannot be encoded are dropped (``'ignore'`` error
    handler) instead of raising.
    """
    utf8_bytes = text.encode('utf8', 'ignore')
    return utf8_bytes
def produce_output_html(url, extracted_content):
    """Build the HTML report page for one extraction and return it as a string.

    url -- address of the analysed page; used as the page title and as a
           link back to the original page.
    extracted_content -- extraction result; this function reads its
           ``chosen_node``, ``title``, ``meta_description`` and
           ``keywords`` attributes.

    The report embeds the markup returned by ``ce.get_xml`` unchanged and
    links to the pruned-tree image at
    ``config.web_root + '<md5 of url>.png'``.
    """
    ec = extracted_content
    html_part = ce.get_xml(ec.chosen_node)
    _id = hashlib.md5(url).hexdigest()
    img_path = config.web_root + '%s.png' % _id
    # The URL comes straight from the request, so escape it (quotes
    # included) before placing it in element text or an attribute value.
    safe_url = cgi.escape(url, True)

    o = StringIO()

    def emit(line):
        # One write per fragment, each followed by a newline.
        o.write(line)
        o.write('\n')

    emit('<html>')
    emit('<head>')
    emit(' <title>%s</title>' % safe_url)
    emit(' </head>')
    emit(' <body>')
    emit(' <a href="%s">%s</a>' % (safe_url, safe_url))
    emit(' <h2>Head - Meta information</h2>')
    emit(' <p>Title: %s</p>' % encode(ec.title))
    emit(' <p>Desc: %s</p>' % encode(ec.meta_description))
    emit(' <h2>Extracted HTML</h2>')
    emit(html_part)
    emit(' <h2>Extracted Keywords</h2>')
    emit(' <table>')
    for source, keyword in ec.keywords:
        # Each row shows the keyword first, then where it came from.
        emit(' <tr><td>%s</td><td>%s</td></tr>' %
             (encode(keyword), encode(source)))
    emit(' </table>')
    emit(' <h2>Pruned tree</h2>')
    emit(' <a href="%s">' % img_path)
    emit(' <img src="%s" />' % img_path)
    emit(' </a>')
    emit(' </body>')
    emit('</html>')
    return o.getvalue()
def main(url):
    """Fetch ``url``, run content extraction and redirect to the report.

    Steps: download the page, extract its content, render the pruned tree
    to ``<md5 of url>.png`` in ``config.data_dir`` with the configured
    ``dot`` binary, write the HTML report next to it, and finally emit a
    CGI ``Location`` header pointing at the report under
    ``config.web_root``.
    """
    data = get_page(url)
    ec = ce.extract_content(data)
    chosen_node = ec.chosen_node

    # make the dot graph
    stream = StringIO()
    nodes, links = ec.pruned_tree
    ce.make_dot_graph(nodes, links, id(chosen_node), stream)
    dot_code = stream.getvalue()

    _id = hashlib.md5(url).hexdigest()
    img_name = os.path.join(config.data_dir, '%s.png' % _id)
    dot = popen2.Popen3('%s -Tpng > "%s"' % (config.dot_bin, img_name))
    try:
        dot.tochild.write(dot_code)
    finally:
        # Closing the child's stdin signals end of input to dot.
        dot.tochild.close()
        dot.fromchild.close()
    # Wait for dot to exit so the image is complete before the client is
    # redirected to a page that references it.  The exit status is not
    # checked: the report is still produced if rendering fails.
    dot.wait()

    html = produce_output_html(url, ec)
    html_name = os.path.join(config.data_dir, '%s.html' % _id)
    html_path = config.web_root + '%s.html' % _id
    html_file = open(html_name, 'w')
    try:
        html_file.write(html)
    finally:
        html_file.close()

    # Three newlines: two from the original format string plus the one
    # the print statement used to append.
    sys.stdout.write('Location: %s\n\n\n' % html_path)
if __name__ == '__main__':
    # CGI entry point: read the target address from the "url" parameter.
    form = cgi.FieldStorage()
    # getfirst() always yields a single value, even when the parameter is
    # repeated in the request (getvalue() would return a list then).
    url = form.getfirst('url', None)
    if not url:
        # Without a URL there is nothing to fetch; answer with an explicit
        # client error instead of failing inside main().
        sys.stdout.write('Status: 400 Bad Request\n')
        sys.stdout.write('Content-Type: text/plain\n\n')
        sys.stdout.write('Missing required "url" parameter.\n')
    else:
        main(url)