PyDiff Lightning Talk Special - 21st March 2017
I have evidence!
It is remarkably easy to fake a Twitter account...
# Page backwards through the user's timeline until a request stops turning up
# anything older than what has already been collected.
#
# This fragment relies on names set up before it: `ta` (presumably the API
# client), `am` (presumably the account's screen name), `tweets` (the list of
# tweets collected so far) and `earliest_id` (the lowest tweet id seen so far).

# assume there are more tweets to retrieve
more_tweets = True
# while there are more tweets to retrieve
while more_tweets:
    # make an api call to get the tweets prior to our earliest retrieved
    # tweet so far. The API treats max_id as inclusive, so asking for
    # `earliest_id` itself would return (and re-append) a tweet we already
    # hold; subtract one to request strictly older tweets.
    params = {
        "screen_name": am,
        "count": 200,
        "exclude_replies": "false",
        "max_id": earliest_id - 1,
    }
    new_tweets = ta.query_get("statuses", "user_timeline", params)

    # add the newly retrieved tweets to our list
    tweets.extend(new_tweets)

    # find the earliest retrieved tweet
    current_earliest = earliest_id
    for tweet in tweets:
        if tweet["id"] < earliest_id:
            earliest_id = tweet["id"]

    # if the earliest tweet hasn't changed we can't go back any further
    if current_earliest == earliest_id:
        more_tweets = False
We want to generate realistic-looking text
Text that looks like Vince wrote it.
Given a word `A`, what is the probability that Vince uses the word `B` next?
Given a sequence `A, B`, what is the probability that Vince uses the word `C` next?
# NOTE(review): this is the body of a function whose `def` line is not shown
# here; `tweets_file` is expected to be an already-open, readable file object.

# Read the whole file into memory as one string.
data = tweets_file.read()
# Split on runs of whitespace to get the individual tokens.
words = data.split()
# Keep only tokens containing neither 'http' nor '@' (str.find returns -1
# when the substring is absent) - presumably to drop links and @-mentions.
words = [word for word in words if word.find('http') < 0 and word.find('@') < 0]
return words
def triples(words):
    """Yield every run of three consecutive words in ``words``.

    For ``["a", "b", "c", "d"]`` this yields ``("a", "b", "c")`` followed by
    ``("b", "c", "d")``. Input with fewer than three words yields nothing.

    Args:
        words: A sliceable sequence of words (e.g. a list).

    Yields:
        Tuples ``(w1, w2, w3)`` of adjacent words, in order of appearance.
    """
    # zip stops at the shortest of the three offset views, so short input
    # needs no explicit length guard.
    for w1, w2, w3 in zip(words, words[1:], words[2:]):
        yield (w1, w2, w3)
def database(words):
    """Build the lookup table from a word pair to the words that followed it.

    Args:
        words: A sliceable sequence of words (e.g. a list).

    Returns:
        A plain dict mapping each adjacent pair ``(w1, w2)`` to a list of
        every word that appeared immediately after that pair, in order of
        appearance. Repeated followers are kept, so a word that follows a
        pair more often appears more often in the list. Input with fewer
        than three words gives an empty dict.
    """
    cache = {}
    # Walk every consecutive three-word window: the first two words form
    # the key, the third is recorded as one of its followers.
    for w1, w2, w3 in zip(words, words[1:], words[2:]):
        cache.setdefault((w1, w2), []).append(w3)
    return cache
def generate_markov_text(size=25, words=None, cache=None):
    """Generate text by randomly walking a word-pair -> followers table.

    A starting pair of adjacent words is picked at random from ``words``;
    each subsequent word is drawn at random from the followers recorded in
    ``cache`` for the current pair.

    Args:
        size: Maximum number of words in the generated text.
        words: Sequence of source words used to pick the starting pair.
            Required; it only has a default so that ``size`` can keep its
            default while staying first in the positional order.
        cache: Mapping from a pair ``(w1, w2)`` to a list of words that
            followed that pair. Required, for the same reason as ``words``.

    Returns:
        The generated words joined by single spaces. The result has
        ``size`` words unless the walk reaches a pair with no recorded
        follower, in which case it ends early.

    Raises:
        TypeError: If ``words`` or ``cache`` is not supplied.
        ValueError: If ``words`` holds fewer than three words.
    """
    if words is None or cache is None:
        raise TypeError(
            "generate_markov_text() requires both 'words' and 'cache'"
        )
    if len(words) < 3:
        raise ValueError("need at least three words to seed the chain")

    # Choosing the seed no later than len(words) - 3 leaves room for the
    # pair's second word and at least one word after it.
    seed = random.randint(0, len(words) - 3)
    w1, w2 = words[seed], words[seed + 1]

    gen_words = []
    while len(gen_words) < size:
        gen_words.append(w1)
        followers = cache.get((w1, w2))
        if not followers:
            # Dead end (e.g. the corpus's final pair): emit the pending
            # word if there is still room, then stop instead of raising.
            if len(gen_words) < size:
                gen_words.append(w2)
            break
        w1, w2 = w2, random.choice(followers)
    return ' '.join(gen_words)
The conspiracy is real
All our Twitter accounts are fake