#!/usr/bin/env python
'''
Extract the main textual content, title, meta information and candidate
keywords from an HTML page read as a string.

The page is parsed with lxml, nodes holding a substantial amount of text
are located, their text lengths are propagated (with decay) to their
ancestors, and the heaviest node is taken as the content container.
'''

import re
import sys

try:
    from cStringIO import StringIO
except ImportError:  # Python 3 has no cStringIO module
    from io import StringIO

from lxml import etree  # http://codespeak.net/lxml/

try:
    _STRING_TYPES = (str, unicode)
except NameError:  # Python 3 has no separate unicode type
    _STRING_TYPES = (str,)

# Tags whose text is never extracted.
IGNORE_TEXT_TAGS = set(['style', 'script'])
# Tags that are never considered as content nodes / keyword sources.
IGNORABLE_TAGS = set(['a']).union(IGNORE_TEXT_TAGS)
# Minimum length of a node's own text for it to count as a content node.
MIN_TEXT_LEN = 50
# Factor applied to a node's weight at every step up the ancestor chain.
PROPAGATE_AMOUNT = .75
# Maximum number of words in a node's text for it to be a keyword phrase.
MAX_WORDS = 4

STOP_WORDS = set([
    'a', 'an', 'the', 'and', 'or', 'for', 'then', 'than', 'where', 'here',
    'there', 'why', 'when', 'how', 'what', 'on', 'of', 'in', 'at', 'to',
    'since', 'i', 'you', 'they', 'am', 'are', 'your', 'them', 'have', 'had',
    'has', 'my', 'having', 'other', 'but', 'all', 'with', 'some', 'no',
])


def get_tag(node):
    '''
    Return the lower-cased tag name of node, or None when the tag
    is not a string.
    '''
    tag = node.tag
    if not isinstance(tag, _STRING_TYPES):
        return None
    return tag.lower()


def get_text(node, recurse=False, level=0, min_len=None):
    '''
    Given a XML node, extract all the text it contains.

    The text is the node's own leading text plus the tail text of each
    child; with recurse=True the text of the children is included too.
    If min_len is given, a (sub)result shorter than min_len characters
    is discarded.

    Returns a string for the top-level call (level == 0) and a list of
    text pieces for the recursive calls.
    '''
    tag = get_tag(node)
    if not tag or tag in IGNORE_TEXT_TAGS:
        return '' if level == 0 else []

    text = [node.text or '']
    for cnode in node:
        if recurse:
            text.extend(get_text(cnode, recurse, level + 1, min_len=min_len))
        if cnode.tail is not None:
            text.append(cnode.tail)

    _text = '\n'.join(text).strip()
    if min_len is not None and len(_text) < min_len:
        _text = ''
        text = []

    if level == 0:
        text = _text
    return text


def get_xml(node):
    '''
    Convert the sub-tree from node downwards into
    string XML representation.
    '''
    return etree.tostring(node)


def create_doc(data):
    '''
    Construct XML tree datastructure from xml string
    representation.
    '''
    parser = etree.HTMLParser()
    return etree.parse(StringIO(data), parser)


def get_content_nodes(doc):
    '''
    Identify nodes in the XML document that have
    substantial text.

    A node qualifies when its tag is not in IGNORABLE_TAGS and its own
    (non-recursive) text is at least MIN_TEXT_LEN characters long.
    '''
    nodes = []
    for n in doc.xpath('//*'):
        if get_tag(n) in IGNORABLE_TAGS:
            continue
        # An empty text is also shorter than MIN_TEXT_LEN, so a single
        # length check covers both cases.
        if len(get_text(n)) < MIN_TEXT_LEN:
            continue
        nodes.append(n)
    return nodes


def compute_node_weights(doc, content_nodes):
    '''
    Weigh every content node and each of its ancestors.

    A content node weighs the length of its own text. That weight is
    multiplied by PROPAGATE_AMOUNT for every step up the ancestor chain
    and added to the ancestor found there.

    Returns a list of (weight, node) tuples sorted by ascending weight.
    doc is not used.
    '''
    weights = {}
    nodes = {}

    for node in content_nodes:
        weight = len(get_text(node))
        # Accumulate rather than assign, so weight already propagated to
        # this node by a descendant is kept whatever the processing order.
        weights[id(node)] = weights.get(id(node), 0) + weight
        nodes[id(node)] = node

        for ancestor in node.iterancestors():
            _id = id(ancestor)
            weight = weight * PROPAGATE_AMOUNT
            weights[_id] = weights.get(_id, 0) + weight
            nodes[_id] = ancestor

    weighted_nodes = [(weight, nodes[_id]) for _id, weight in weights.items()]
    # Sort on the weight alone: on equal weights a plain tuple sort would
    # fall through to comparing the node objects themselves.
    weighted_nodes.sort(key=lambda pair: pair[0])
    return weighted_nodes


def remove_deep_nodes(node, threshold=0.50):
    '''
    Prune the deeper parts of the sub-tree under node, in place.

    The sub-tree is walked level by level while summing up the text seen
    so far. Once that sum reaches threshold times the total text length
    of the sub-tree, every child whose tag is not a simple inline
    formatting tag is replaced by an empty 'dummy' element that keeps
    the child's tail text.

    Returns the list of nodes that were replaced.
    '''
    allowed_deep_nodes = set(['b', 'i', 'u', 'strong', 'font', 'a',
                              'big', 'small', 'em'])

    content_len = len(get_text(node, recurse=True, min_len=MIN_TEXT_LEN))
    threshold_len = int(threshold * content_len)

    content_seen = 0
    removed_nodes = []
    level_nodes = [node]

    while level_nodes:
        for current in level_nodes:
            content_seen += len(get_text(current, min_len=MIN_TEXT_LEN))

        # content_seen does not change while the children of this level
        # are processed, so the comparison is done once per level.
        over_threshold = content_seen >= threshold_len

        next_level = []
        for parent in level_nodes:
            # Snapshot the children: the parent is modified in the loop.
            for child in list(parent):
                if not over_threshold or get_tag(child) in allowed_deep_nodes:
                    next_level.append(child)
                    continue

                placeholder = parent.makeelement('dummy')
                placeholder.tail = child.tail
                parent.replace(child, placeholder)
                removed_nodes.append(child)

        level_nodes = next_level

    return removed_nodes


def make_pruned_tree(content_nodes):
    '''
    Prune the whole XML tree by removing nodes other than
    content nodes and their ancestors.

    Returns (nodes, links): nodes maps id(node) to the node, links maps
    id(node) to id(parent of node).
    '''
    nodes = {}
    links = {}

    for node in content_nodes:
        nodes[id(node)] = node
        parent = node.getparent()
        if parent is not None:
            links[id(node)] = id(parent)

        for anode in node.iterancestors():
            _id = id(anode)
            parent = anode.getparent()
            if parent is not None:
                links[_id] = id(parent)
            if _id not in nodes:
                nodes[_id] = anode

    return nodes, links


def remove_nodes_from_tree(nodes, links, remove_nodes):
    '''
    Return copies of nodes and links (as produced by make_pruned_tree)
    without the given remove_nodes and without any link that starts or
    ends at one of them.
    '''
    ids = set(id(r) for r in remove_nodes)
    nodes = dict((_id, n) for _id, n in nodes.items() if _id not in ids)
    links = dict((f, t) for f, t in links.items()
                 if f not in ids and t not in ids)
    return nodes, links


def get_inlink_counts(links):
    '''
    Given the inter-node links, count for every link target the number
    of links coming into it.

    Returns a dict mapping target id to count.
    '''
    counts = {}
    for to_id in links.values():
        counts[to_id] = counts.get(to_id, 0) + 1
    return counts


def get_most_linked_node(nodes, links):
    '''
    Identify the node which is most linked. (i,e) has
    most number of inlinks.

    Returns None when there are no links at all.
    '''
    inlink_counts = get_inlink_counts(links)
    if not inlink_counts:
        return None
    mcount, mid = max((count, _id) for _id, count in inlink_counts.items())
    return nodes[mid]


def get_most_weight_node(doc, content_nodes):
    '''
    Return the node with the highest weight as computed by
    compute_node_weights, or None when there is no content node.
    '''
    weighted_nodes = compute_node_weights(doc, content_nodes)
    if not weighted_nodes:
        return None
    weight, node = weighted_nodes[-1]
    return node


def make_dot_graph(nodes, links, chosen_node, stream):
    '''
    Construct the dot format graph representation
    so that graphviz can render the tree for visualization.

    nodes and links are the mappings produced by make_pruned_tree,
    chosen_node is the id of the node to highlight and stream is the
    file-like object the graph is written to.
    '''
    def emit(line):
        stream.write(line + "\n")

    emit("digraph G {")

    for _id, node in nodes.items():
        tlen = len(get_text(node))
        tag = node.tag
        if tlen:
            text = '%s (%d)' % (tag, tlen)
        else:
            text = tag

        if _id == chosen_node:
            attrs = 'style=filled color=lightblue'
        else:
            attrs = ''

        emit("%s [label=\"%s\" %s];" % (_id, text, attrs))

    for fid, tid in links.items():
        emit("%d -> %d;" % (fid, tid))

    emit("}")


class ExtractedContent:
    '''
    Plain container for the results of extract_content.
    '''

    def __init__(self):
        self.html = None              # input html string
        self.doc = None               # parsed document
        self.content_nodes = None     # nodes with substantial text
        self.pruned_tree = None       # (nodes, links) mappings
        self.chosen_node = None       # node chosen as content container
        self.extracted_text = None    # text of the chosen node
        self.title = None             # text of the title element
        self.meta_description = None  # content of the description meta tag
        self.keywords = []            # list of (source, keyword) tuples


def get_head_info(doc):
    '''
    Return (title, meta_description, meta_keywords) of the document.

    meta_keywords is the list of non-empty, stripped items of the comma
    separated keywords meta tag.
    '''
    title = doc.xpath('string(//title)').strip()
    # "escription" / "eywords" match the attribute value regardless of
    # the case of its first letter.
    meta_description = doc.xpath(
        'string(//meta[contains(string(@name), "escription")]/@content)')
    meta_keywords = doc.xpath(
        'string(//meta[contains(string(@name), "eywords")]/@content)')

    meta_description = meta_description.strip()
    meta_keywords = [m.strip() for m in meta_keywords.split(',')]
    meta_keywords = [m for m in meta_keywords if m]

    return title, meta_description, meta_keywords


def get_keywords(text):
    '''
    Return the distinct capitalized phrases found in text, leaving out
    single characters, short (<= 3 characters) phrases that are not all
    caps, pure numbers and stop words.
    '''
    keywords = set()
    for kw, start, end in extract_keywords_from_blob(text):
        if len(kw) <= 1:
            continue
        if not is_all_caps(kw) and len(kw) <= 3:
            continue
        if kw.isdigit():
            continue
        if kw.lower() in STOP_WORDS:
            continue
        keywords.add(kw)
    return list(keywords)


def extract_keywords(node):
    '''
    Collect keywords from node and all of its descendants.

    Returns a list of (source, keyword) tuples. For a node whose leading
    text has at most MAX_WORDS words, source is the node's tag and
    keyword is that text. Capitalized phrases found in a node's text are
    reported with 'blob' as source.
    '''
    candidates = node.xpath('.//*')
    candidates.append(node)

    keywords = []
    for candidate in candidates:
        tag = get_tag(candidate)
        if tag in IGNORABLE_TAGS:
            continue
        if not candidate.text:
            continue

        text = candidate.text.strip()
        words = [t.strip() for t in text.split(' ') if t.strip()]
        if words and len(words) <= MAX_WORDS:
            keywords.append((tag, text))

        for kw in get_keywords(get_text(candidate)):
            keywords.append(('blob', kw))

    return keywords


def is_all_caps(text):
    '''
    Return True when upper-casing text leaves it unchanged.
    '''
    return text.upper() == text


def _build_keyword_regex():
    '''
    Build the regular expression matching capitalized word sequences.
    '''
    word_endings = r"(?:'s|'ve|'nt|'d|'t)"

    # Words based on their type
    alpha_word = r"[A-Z][A-Za-z]*" + word_endings + r"{0,1}"
    stop_word = r"(?:of|the|in|on|and|a|de|la|at)"
    number_word = r"[0-9]+(?:th|nd|st|rd){0,1}"

    # Words based on location in the phrase
    start_word = end_word = "(?:" + alpha_word + "|" + number_word + ")"
    middle_word = ("(?:" + alpha_word + "|" + stop_word + "|" +
                   number_word + ")")

    # Separators that come between words
    sepspace = r"[ ]{1,1}"
    sepcolon = r"[ ]{0,2}:[ ]{0,2}"
    sepdash = r"-"
    sepbang = r"![ ]{0,2}"
    sepdot = r"\.[ ]{0,2}"
    sepcomma = r"[ ]{0,2},[ ]{0,2}"
    sepampersand = r"[ ]{0,1}&[ ]{0,1}"

    word_sep = ("(?:" + sepdash + "|" + sepcolon + "|" + sepdot + "|" +
                sepcomma + "|" + sepbang + "|" + sepspace + "|" +
                sepampersand + ")")

    reg = (start_word + "(?:" + word_sep + middle_word + ")*" +
           end_word + "{0,1}")
    return re.compile(reg)


# Compiled once at import time instead of on every call.
_KEYWORD_RE = _build_keyword_regex()


def extract_keywords_from_blob(text):
    '''
    Extracts proper noun phrases like "Sanskrit Pathshala" etc
    from text. Searches for capitalized word sequences.

    Returns a list of [phrase, start, end] lists, where start and end
    are the offsets of the match in text.
    '''
    return [[match.group().strip(), match.start(), match.end()]
            for match in _KEYWORD_RE.finditer(text)]


def extract_content(html):
    '''
    Run the whole extraction on an html string and return the results
    as an ExtractedContent instance.

    When the page has no node with substantial text, chosen_node is
    None, extracted_text is empty and only the meta keywords are
    reported.
    '''
    # make doc from html data (cleans html)
    doc = create_doc(html)

    # read the head information first: removing deep nodes below
    # modifies the document in place
    title, meta_description, meta_keywords = get_head_info(doc)

    # identify content nodes
    content_nodes = get_content_nodes(doc)

    # prune xml tree to remove irrelevant nodes
    nodes, links = make_pruned_tree(content_nodes)

    # get the node with the highest weight
    mnode = get_most_weight_node(doc, content_nodes)

    extracted_text = ''
    if mnode is not None:
        removed_nodes = remove_deep_nodes(mnode)
        nodes, links = remove_nodes_from_tree(nodes, links, removed_nodes)
        extracted_text = get_text(mnode, recurse=True)

    ec = ExtractedContent()
    ec.html = html
    ec.doc = doc
    ec.content_nodes = content_nodes
    ec.pruned_tree = (nodes, links)
    ec.chosen_node = mnode
    ec.extracted_text = extracted_text
    ec.title = title
    ec.meta_description = meta_description

    for mk in meta_keywords:
        ec.keywords.append(('meta', mk))

    if mnode is not None:
        for ek in extract_keywords(mnode):
            ec.keywords.append(ek)

    return ec


def main():
    '''
    Read an html page from standard input and run the extraction on it.
    '''
    # perform content extraction
    ec = extract_content(sys.stdin.read())

    # make the dot graph (the call that writes it is commented out)
    nodes, links = ec.pruned_tree
    chosen_node = ec.chosen_node
    #make_dot_graph(nodes, links, id(chosen_node), sys.stdout)


if __name__ == '__main__':
    #Eg: wget "http://blog.prashanthellina.com" -O - | python thisscript.py | dot -Tpng -o /tmp/test.png ; eog /tmp/test.png
    main()