"""Download book cover images and PDF files described on standard input.

Each non-blank input line is a Python expression that evaluates to a record
``(class_number, subject, book, html_link, image_link, pdf_links)``.  For
every record a directory ``class_<class_number>.<subject>.<book>`` is created
under the output directory; the image is saved there as ``image.jpg`` and each
PDF is saved under a file name derived from its link.  Downloads are performed
by the external ``wget`` program.

Usage: pass the output directory as the only argument and feed the records on
standard input.
"""

import os
import os.path
import string
import subprocess
import sys
import urllib

try:
    set()
except NameError:
    # Fallback for interpreters that predate the builtin ``set`` type.
    import sets
    set = sets.Set

try:
    from urllib import unquote as _py2_unquote
except ImportError:
    from urllib.parse import unquote as _py3_unquote

    def _unquote_latin1(text):
        """Percent-decode *text*, reading the escaped bytes as Latin-1."""
        return _py3_unquote(text, encoding='latin1')
else:
    def _unquote_latin1(text):
        """Percent-decode *text*, reading the escaped bytes as Latin-1."""
        raw = _py2_unquote(text)
        if isinstance(raw, unicode):  # noqa: F821 - this branch is Python 2 only
            # Already text; calling .decode() here would first force an
            # implicit ASCII encode and fail on non-ASCII characters.
            return raw
        return raw.decode('latin1')


# Characters that survive clean(): ASCII letters and digits only.
ALPHANUMERALS = set(string.ascii_letters + string.digits)

# Historical shell template.  Kept only so that existing importers of this
# name keep working; downloads no longer go through a shell (see
# save_link_to_file).
WGET_CMD = 'wget -O "%s" --quiet "%s"'


def clean(text):
    """Return *text* with every character that is not an ASCII letter or digit removed."""
    return ''.join([c for c in text if c in ALPHANUMERALS])


def save_link_to_file(link, fname):
    """Download *link* into the file *fname* using ``wget``.

    Returns True when the download FAILED (non-zero exit status, or ``wget``
    could not be started) and False on success.
    """
    print('getting link "%s" to file "%s" ...' % (link, fname))
    try:
        # An argument list (no shell) keeps quotes, backticks or ``$(...)`` in
        # a link or file name from being interpreted as shell syntax.
        status = subprocess.call(['wget', '-O', fname, '--quiet', link])
    except OSError:
        # wget is missing or not executable: report it as a failed download.
        return True
    return bool(status)


def read_books(lines):
    """Parse book records from *lines*, an iterable of text lines.

    Blank lines are skipped.  Only the line terminator is stripped, so a final
    line without a trailing newline is parsed intact.  Returns the list of
    evaluated records in input order.
    """
    books = []
    for line in lines:
        line = line.rstrip('\r\n')
        if not line.strip():
            continue
        # SECURITY: eval() executes arbitrary Python taken from the input.
        # Only feed this script data from a trusted source.  If the records
        # are plain literals, ast.literal_eval would be a safe replacement
        # (TODO: confirm against the producer of this data).
        books.append(eval(line))
    return books


def pdf_file_name(pdf_link, index):
    """Return an ASCII-only local file name for *pdf_link*.

    The last path component of the link is percent-decoded (Latin-1) and
    non-ASCII characters are dropped.  When nothing usable remains, the name
    ``document_<index>.pdf`` is returned instead.
    """
    name = _unquote_latin1(os.path.basename(pdf_link)).encode('ascii', 'ignore')
    if not isinstance(name, str):
        # Python 3: encode() produced bytes; convert back to text.
        name = name.decode('ascii')
    # Percent-escapes may decode to path separators ("%2F"); take the last
    # component again so the file cannot land outside the book directory.
    name = os.path.basename(name)
    if name in ('', '.', '..'):
        name = 'document_%d.pdf' % index
    return name


def _ensure_dir(path):
    """Create directory *path*, including missing parents, unless it exists."""
    if not os.path.isdir(path):
        os.makedirs(path)


def main(output_dir):
    """Read book records from standard input and download them into *output_dir*.

    A failed download is reported on standard error and does not stop the
    remaining downloads.
    """
    books = read_books(sys.stdin)
    _ensure_dir(output_dir)

    # html_link is part of every record but is not downloaded.
    for class_number, subject, book, html_link, image_link, pdf_links in books:
        book_name = 'class_%s.%s.%s' % (class_number, clean(subject), clean(book))
        book_dir = os.path.join(output_dir, book_name)
        _ensure_dir(book_dir)

        image_path = os.path.join(book_dir, 'image.jpg')
        if save_link_to_file(image_link, image_path):
            sys.stderr.write('could not get image link "%s"\n' % (image_link,))

        for index, pdf_link in enumerate(pdf_links):
            pdf_path = os.path.join(book_dir, pdf_file_name(pdf_link, index))
            if save_link_to_file(pdf_link, pdf_path):
                sys.stderr.write('could not get pdf link "%s"\n' % (pdf_link,))


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('Usage: %s <output_dir>' % (sys.argv[0],))
        sys.exit(2)
    main(sys.argv[1])