deduplication jaccard python
s1 = "what's the flight time from Berlin to Helsinki?"
s2 = "how long does it take to fly from Berlin to Helsinki?"
shingles1 = set([s1[max(0, i-4):i] for i in range(4, len(s1) + 1)])
shingles2 = set([s2[max(0, i-4):i] for i in range(4, len(s2) + 1)])
len(shingles1 & shingles2) / len(shingles1 | shingles2)