# Scraper for the NCERT online textbook catalogue.
#
# Flow: fetch the listing page, parse the embedded JavaScript that maps
# (class, subject) to book pages, fetch each book page to find its PDF and
# cover image, then scan the PDF for further embedded PDF links.

import os
import re
import subprocess
import sys

try:  # Python 2 module names, as originally imported by this script
    import urllib2
    from urlparse import urljoin
except ImportError:  # Python 3 equivalents, bound to the same names
    import urllib.request as urllib2
    from urllib.parse import urljoin

BASE_LINK = 'http://www.ncert.nic.in'
USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows XP)'

# Everything between the first and the last "SCRIPT" token of the page.
_SCRIPT_RE = re.compile(r"SCRIPT.*SCRIPT", re.DOTALL)
# One "tclass.value==N ... }" branch: the class number and the branch body up
# to the first closing brace.
_CLASS_BLOCK_RE = re.compile(
    r"document\.test\.tclass\.value==([0-9]{1,2})(.*?\})", re.DOTALL)
# Subject name compared against inside a branch.
_SUBJECT_RE = re.compile(
    r'document\.test\.tsubject\.options\[sind\]\.text=="(.*?)"')
# `.text="..."` / `.value="..."` assignments inside a branch.
_OPTION_RE = re.compile(r'\.(text|value)="(.*?)"', re.DOTALL)
_HREF_RE = re.compile(r"""href *= *["'](.*?)["']""")
_SRC_RE = re.compile(r"""src *= *["'](.*?)["']""")
# URI link annotations inside raw PDF data.
_PDF_URI_RE = re.compile(r"URI\((.*?)\)/S/URI")


def _warn(message, link):
    """Write ``message`` followed by a space and ``link`` to stderr."""
    sys.stderr.write("%s %s\n" % (message, link))


def get_page(url):
    """Fetch ``url`` with ``wget`` and return the response body as text.

    The command is run from an argument list without a shell, so characters
    in ``url`` (which may come from scraped HTML) are never interpreted by a
    shell.

    Returns an empty string when ``url`` is empty/None, when ``wget`` cannot
    be started, or when ``wget`` produces no output.
    """
    if not url:
        return ''
    cmd = ['wget', '-O', '-', '--quiet', url]
    try:
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    except OSError:
        # wget is not installed / not executable: treat as a failed fetch.
        return ''
    data, _ = proc.communicate()
    if not isinstance(data, str):
        # Python 3 yields bytes. latin-1 maps every byte to one character, so
        # binary PDF content survives and can still be searched with regexes.
        data = data.decode('latin-1')
    return data


def get_book_urls(listing_page_html):
    """Extract book entries from the listing page's embedded JavaScript.

    Returns a list of ``[class_number, subject_name, title, link]`` records,
    where ``class_number`` is an int, ``subject_name`` is a string or None
    when the branch names no subject, and ``link`` is the raw (possibly
    relative) value from the script. Returns an empty list when the page
    contains no SCRIPT section.
    """
    script_sections = _SCRIPT_RE.findall(listing_page_html)
    if not script_sections:
        return []
    script_data = script_sections[0]

    books = []
    for class_number, branch in _CLASS_BLOCK_RE.findall(script_data):
        class_number = int(class_number)
        subjects = _SUBJECT_RE.findall(branch)
        subject_name = subjects[0] if subjects else None

        # A `.text` assignment supplies the title for the `.value`
        # assignments that follow it; values seen before any non-empty title
        # are skipped.
        title = None
        for kind, value in _OPTION_RE.findall(branch):
            value = value.strip()
            if kind == 'text':
                title = value
            elif title:
                books.append([class_number, subject_name, title, value])
    return books


def get_data_from_book_page(book_page_html):
    """Return ``(pdf_link, image_link)`` found on a book page.

    ``pdf_link`` is the first ``href`` attribute and ``image_link`` the first
    ``src`` attribute in the page, each resolved against ``BASE_LINK``.
    Either element is None when the page has no such attribute.
    """
    hrefs = _HREF_RE.findall(book_page_html)
    pdf_link = urljoin(BASE_LINK, hrefs[0]) if hrefs else None

    srcs = _SRC_RE.findall(book_page_html)
    image_link = urljoin(BASE_LINK, srcs[0]) if srcs else None

    return (pdf_link, image_link)


def get_pdf_links(pdf_data):
    """Return absolute URLs of the PDF links embedded in raw PDF data.

    Only URI targets ending in "pdf" (case-insensitive, ignoring surrounding
    whitespace) are kept. Targets are resolved against ``BASE_LINK`` and
    backslashes are removed from the result.
    """
    return [
        urljoin(BASE_LINK, target).replace('\\', '')
        for target in _PDF_URI_RE.findall(pdf_data)
        if target.lower().strip().endswith("pdf")
    ]


def main():
    """Crawl the catalogue, print one record per book, and return them all.

    Each record is ``[class_number, subject_name, title, book_page_link,
    image_link, pdf_links]``, where ``pdf_links`` starts with the book's main
    PDF followed by any PDF links found inside it. Books whose page or PDF
    cannot be fetched are reported on stderr and skipped.
    """
    listing_page_html = get_page(
        "http://www.ncert.nic.in/textbooks/testing/first.htm")
    book_links = get_book_urls(listing_page_html)

    final_data = []
    for class_number, subject_name, text, link in book_links:
        link = urljoin(BASE_LINK, link)
        book_page_html = get_page(link)
        if not book_page_html:
            _warn("could not fetch book html page: ", link)
            continue

        pdf_link, image_link = get_data_from_book_page(book_page_html)
        if not pdf_link and link.lower().endswith('.pdf'):
            # The catalogue entry points straight at a PDF.
            pdf_link = link

        # Without a PDF link there is nothing to fetch; report it the same
        # way as a failed download instead of asking wget for "None".
        pdf_data = get_page(pdf_link) if pdf_link else ''
        if not pdf_data:
            _warn("could not fetch book pdf page: ", link)
            continue

        pdf_links = [pdf_link]
        pdf_links.extend(get_pdf_links(pdf_data))

        data = [class_number, subject_name, text, link, image_link, pdf_links]
        print(data)
        final_data.append(data)
    return final_data


if __name__ == '__main__':
    main()