lundi 20 juin 2016

Tweepy to not store duplicate tweets into database

I'm using Tweepy to get tweets and store all the tweets into database. but the problem I'm facing right now is Tweepy also store duplicate tweets into database.

here's the code I use below:

from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
from flask_sqlalchemy import SQLAlchemy
from models import TrainingTweets, db
import mysql.connector
import json
import tweepy
from tweepy.api import API

#consumer key, consumer secret, access token, access secret.
ckey=""
csecret=""
atoken="-"
asecret=""

auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)

api = tweepy.API(auth)


class listener(StreamListener):

    def __init__(self, api=None):
        self.api = api or API()
        self.n = 0
        self.m = 50

    def on_data(self, data):
        all_data = json.loads(data)
        self.n = self.n+1
        if self.n <= self.m:
            tweet = all_data["text"]
            username = all_data["user"]["screen_name"]
            label = "1"
            ttweets = TrainingTweets(label_id=label, tweet_username=username, tweet=tweet)
            db.session.add(ttweets)
            db.session.commit()
            print((username, tweet))
            return True
        else:
            print("Successfully stored ", self.m, " tweets into database")
            return False

    def on_error(self, status):
        print(status)

auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)

twitterStream = Stream(auth, listener())
twitterStream.filter(track=["health"], languages=["en"], follow="")

can anyone help me to make Tweepy only store one tweet instead of all duplicate tweets into database?

Aucun commentaire:

Enregistrer un commentaire