#!/usr/bin/env python
"""Emit word 1-, 2- and 3-grams for text read from standard input.

Input is cut into sentence-like segments at any of the characters
``. , ; : ! ?``.  A segment may span several physical input lines; it is
buffered until a delimiter is seen.  Each segment is normalised (lower-cased,
characters other than ASCII letters, digits and apostrophes replaced by
spaces) and every n-gram is written to standard output, one per line.
"""
import sys
import re
import string

MAX_LINE_LENGTH = 1024
MAX_WORD_LENGTH = 50
LINE_DELIMITERS = set('.,;:!?')
# N-gram orders emitted for every starting word, in output order.
NGRAM_SIZES = (1, 2, 3)

RE_HAS_LINE_DELIM = re.compile(r'[.,;:!?]')
RE_NON_ACCEPTIBLE_CHARS = re.compile(r"[^A-Za-z0-9']")
RE_REMOVABLE_QUOTES = re.compile(r"([^A-Za-z])'([^A-Za-z])")
RE_WHITESPACE = re.compile(r'\s+')


def process_line(line):
    """Normalise one text segment and write its n-grams to stdout.

    Args:
        line: Either a string, or an iterable of string fragments that are
            joined with single spaces before processing.

    Nothing is written when the segment is longer than ``MAX_LINE_LENGTH``
    characters, when it contains a word longer than ``MAX_WORD_LENGTH``
    characters, or when no words remain after normalisation.

    For each word position the unigram, bigram and trigram starting there are
    written in that order (shorter tails at the end of the segment are
    skipped), one n-gram per output line.
    """
    if not isinstance(line, str):
        line = ' '.join(line)

    # Guard on the *line* limit; comparing a whole segment against the word
    # limit would discard every sentence longer than 50 characters.
    if len(line) > MAX_LINE_LENGTH:
        return

    line = RE_NON_ACCEPTIBLE_CHARS.sub(' ', line)
    # Drop apostrophes that are not between two letters.
    line = RE_REMOVABLE_QUOTES.sub(r'\1 \2', line)
    line = line.lower()

    # split() without arguments collapses whitespace runs and yields an empty
    # list for blank input, so no empty "word" is ever emitted.
    words = line.split()
    if not words:
        return
    if any(len(word) > MAX_WORD_LENGTH for word in words):
        return

    # sys.stdout is looked up at call time so redirected output is honoured.
    write = sys.stdout.write
    for start, _ in enumerate(words):
        for size in NGRAM_SIZES:
            gram = words[start:start + size]
            # A short slice means we ran off the end of the segment.
            if len(gram) == size:
                write(' '.join(gram) + '\n')


def main(stream=None):
    """Read text from *stream* and emit n-grams for every segment.

    Args:
        stream: Iterable of text lines.  Defaults to ``sys.stdin``.

    Physical lines are split on the delimiter characters.  Text before the
    first delimiter completes the segment buffered from earlier lines; text
    after the last delimiter starts a new buffer; anything in between is a
    complete segment of its own.  A buffer whose fragments total more than
    ``MAX_LINE_LENGTH`` characters is discarded.
    """
    if stream is None:
        stream = sys.stdin

    pending = []
    for raw in stream:
        # Give up on a run-away segment that never reaches a delimiter.
        if sum(len(part) for part in pending) > MAX_LINE_LENGTH:
            pending = []

        segments = RE_HAS_LINE_DELIM.split(raw)
        if len(segments) == 1:
            # No delimiter on this line: keep accumulating.
            pending.append(segments[0])
            continue

        pending.append(segments[0])
        process_line(pending)
        pending = [segments[-1]]
        for segment in segments[1:-1]:
            process_line(segment)

    # Flush whatever is still buffered at end of input; otherwise the final
    # segment is lost whenever the input does not end with a delimiter.
    if pending:
        process_line(pending)


if __name__ == '__main__':
    main()