


June 22nd, 2017

More on Python

Today we got a quick demonstration of how to do RapidMiner's job in Python using libraries like pandas, NumPy, and scikit-learn. After the demonstration, all of us got to speak about each other's progress so we could learn from one another and make sure we are on the right path. Prof. Lian Duan took his time to give us some advice on what to do next, especially those whose mentor has been gone for the week.
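As a reminder of what that kind of workflow looks like in code, here is a minimal sketch of a RapidMiner-style load/train/evaluate pipeline with pandas, NumPy, and scikit-learn. The file name, column names, and choice of classifier are my own assumptions for illustration, not the dataset or model used in the demonstration.

# Minimal sketch of a RapidMiner-style workflow in Python (hypothetical data).
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load a labeled dataset (hypothetical path and schema with a "label" column).
df = pd.read_csv("example_data.csv")

# Separate features and label; scikit-learn works on NumPy arrays.
X = df.drop(columns=["label"]).to_numpy()
y = np.asarray(df["label"])

# Hold out part of the data, then train and score a simple classifier.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
print("Accuracy:", accuracy_score(y_test, model.predict(X_test)))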
I started working on splitting the tweets to gather the most used words and icons in the tweets related to cancer. I achieved this by using the Natural Language Toolkit (NLTK) from Python to split the tweets into tokens, and then I sorted them by count.

Program used:

from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
from Keys import *
import operator  # For sorting
import json  # To turn the data that comes from Twitter in a JSON format into a dictionary
import csv
import nltk  # Natural Language Toolkit for splitting the tweets into words
from nltk.corpus import treebank

Query = ["cancer"]
File_path = r"E:\REU Program\Data\Tweets_2.csv"
word_count = {}
WroteLabels = False  # flag used to stop re-reading the file to check for already existing labels unnecessarily

# To stop the stream, cancel the program or press Ctrl+C in the console
print("Attempting to make connection with the Twitter API...")


class listener(StreamListener):
    def __init__(self):
        print("Connection established... Gathering live tweets with the terms ", Query)
        print("This is a continuous stream of data... To stop the stream, press Ctrl + C on the terminal or stop the program")
        labels = ["Date", "Tweet", "Retweet", "Location", "Name", "Username", "Description",
                  "Followers Count", "Friends Count", "Language", "Timezone", "iPhone User"]
        SaveToCsv(File_path, labels)

    def on_data(self, raw_data):
        global word_count
        print(raw_data)
        data = json.loads(raw_data)

        # Extracting attributes from the data.
        has_iPhone = "iPhone" in raw_data
        tweet = data["text"]
        retweet = False
        if "RT @" in tweet:
            tweet = data["retweeted_status"]["text"]
            retweet = True
        time = data["created_at"]
        user = data["user"]
        location = user["location"]
        name = user["name"]
        screen_name = user["screen_name"]
        description = user["description"]
        followers_count = user["followers_count"]
        friends_count = user["friends_count"]
        language = user["lang"]
        time_zone = user["time_zone"]

        # Printing to show progress
        print(tweet)

        # Splitting the tweets into words.
        # TODO: Correct word misspelling.
        # TODO: Don't count hyperlinks or special characters like '/'
        tokens = nltk.word_tokenize(tweet)
        for word in tokens:
            word = word.lower()
            if word not in word_count:
                word_count[word] = 1
            else:
                word_count[word] += 1

        tagged = nltk.pos_tag(tokens)
        entities = nltk.chunk.ne_chunk(tagged)
        print(tagged)
        print("||||")
        print(entities)

        word_count_sorted = sorted(word_count.items(), key=operator.itemgetter(1))
        print("|||")  # To separate each print.
        print(word_count_sorted)  # Final use

        csv_data = [time, tweet, retweet, location, name, screen_name, description,
                    followers_count, friends_count, language, time_zone, has_iPhone]
        SaveToCsv(File_path, csv_data)
        # print(word_count)

    def on_error(self, status_code):
        print("There was an error...")
        print("Status code", status_code)
        return True  # Don't kill the stream

    def on_timeout(self):
        print("Timeout...")
        return True  # Don't kill the stream


def SaveToCsv(path, csv_data):
    # Using the global flag
    global WroteLabels
    # Preventing the duplication of labels
    if not WroteLabels:
        try:
            readable = open(path, "r", encoding="utf-8")
            lines = readable.readlines()
            # We assume that the first line will contain the attributes,
            # so we just check if the first value of the first line matches the first value of the given csv_data list.
            if lines[0].split(",")[0] == csv_data[0]:
                readable.close()
                WroteLabels = True
                return
            readable.close()
        except FileNotFoundError:
            print("Creating file...")
        except IndexError:
            print("Creating file...")
    with open(path, "a", encoding="utf-8") as csvfile:
        csv_writer = csv.writer(csvfile, delimiter=",", lineterminator="\n")
        csv_writer.writerow(csv_data)


auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)
twitterStream = Stream(auth, listener())
twitterStream.filter(track=Query)
# locations=[-74.201145,40.722933,-74.148273,40.744786] -- Newark Location
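The two TODOs in on_data (not counting hyperlinks or special characters, and ultimately pulling out the most used words) could be handled with a small token-cleaning helper plus collections.Counter. The sketch below is only one possible approach under my own assumptions; the clean_tokens helper, the sample tweet, and the cutoff of 10 are illustrative and not part of the program above.

# Sketch of a token-cleaning step for the word counts (assumed approach, not the original code).
# Requires the NLTK 'punkt' tokenizer data (nltk.download('punkt')).
from collections import Counter
import nltk

def clean_tokens(tokens):
    """Lower-case tokens, dropping hyperlink fragments and tokens with no letters."""
    cleaned = []
    for token in tokens:
        token = token.lower()
        if "http" in token or "//" in token:
            continue  # skip hyperlink pieces produced by the tokenizer
        if not any(ch.isalpha() for ch in token):
            continue  # skip punctuation like '/', '...', and bare numbers
        cleaned.append(token)
    return cleaned

# Small self-contained demonstration with a made-up tweet.
word_count = Counter()
sample_tweet = "New study on breast cancer screening: https://example.com/abc #cancer"
word_count.update(clean_tokens(nltk.word_tokenize(sample_tweet)))
print(word_count.most_common(10))  # the most used words, highest count first

Inside on_data, the manual counting loop could then be replaced by word_count.update(clean_tokens(tokens)), which also removes the need to sort the dictionary by value by hand.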
Links visited: ................

