--- title: "R Notebook" output: html_notebook --- Most of these libraries have been used someone or another in the code. Some were left out however. ```{r} #Necessary Libraries library(lubridate) library(ggpubr) library(xts) library(topicmodels) library(plotly) library(tm) library(sentimentr) library(dplyr) library(syuzhet) library(rvest) library(httr) library(magrittr) library(stringr) library(tm) library(tibble) library(rvest) library(tidytext) library(ggplot2) library(wordcloud) library(tidyr) library(usmap) library(maps) library(ggmap) library(googleAuthR) library(sp) library(maps) library(maptools) library(revgeo) library(choroplethrMaps) library(mapproj) library(SnowballC) library(RColorBrewer) ``` The following code snippet reads the number of reviews on the website for the uber partner drivers site ```{r} number = read_html(paste("https://www.indeed.com/cmp/Uber-Partner-Drivers/reviews?start=0")) %>% html_nodes("b") %>% html_text() number = gsub(pattern = ",", replacement = "", number) #removing the comma so it can be transformed into a number number = as.numeric(as.character(number)) ``` Scraping the tite of the review, the text, the date, and the number of stars given and putting it into a data frame ```{r} page = seq(from = 0, to = number, by = 20) #Deriving the number of pages to be considered uber_drivers = NULL #Creating an empty data frame for (start in (page)) { # For loop to read in the file and append it to a "running total dataframe" called articles} na = c(NA, NA,NA,NA, NA,NA,NA, NA,NA,NA, NA,NA,NA, NA,NA,NA, NA,NA,NA, NA) title = read_html(paste("https://www.indeed.com/cmp/Uber-Partner-Drivers/reviews?start=", start, sep = '')) %>% html_nodes("div[itemprop='review']:nth-of-type(n+2) .cmp-review-title span:nth-of-type(1)") %>% html_text() #print(title) date = read_html(paste("https://www.indeed.com/cmp/Uber-Partner-Drivers/reviews?start=", start, sep = '')) %>% html_nodes("div[itemprop='review']:nth-of-type(n+2) span.cmp-review-date-created") %>% html_text() #print(date) review = read_html(paste("https://www.indeed.com/cmp/Uber-Partner-Drivers/reviews?start=", start, sep = '')) %>% html_nodes("div[itemprop='review']:nth-of-type(n+2) span[itemprop='reviewBody']") %>% html_text() #print(review) stars = read_html(paste("https://www.indeed.com/cmp/Uber-Partner-Drivers/reviews?start=", start, sep = '')) %>% html_nodes("div[itemprop='review']:nth-of-type(n+2) div.cmp-ratingNumber") %>% html_text() stars = gsub(pattern = "", replacement = "", stars) stars = as.numeric(stars) #print(stars) city = read_html(paste("https://www.indeed.com/cmp/Uber-Partner-Drivers/reviews?start=", start, sep = '')) %>% html_nodes("div[itemprop='review']:nth-of-type(n+2) span.cmp-reviewer-job-location") %>% html_text() #The web scraper about 5 times will break if it only finds 19 elements on the page instead of 20. 
  # The following if statements let the loop keep running when that happens, by
  # inserting a vector of NAs whenever the scraper does not return 20 values.
  if (length(title) == 20) { title = title } else { title = na }
  if (length(date) == 20) { date = date } else { date = na }
  if (length(review) == 20) { review = review } else { review = na }
  if (length(stars) == 20) { stars = stars } else { stars = na }
  if (length(city) == 20) { city = city } else { city = na }
  temp = data.frame(title, review, date, stars, city)
  uber_drivers = rbind(uber_drivers, temp)
}
uber_drivers = na.omit(uber_drivers)
uber_drivers$position = "driver"
uber_drivers$date = str_replace_all(uber_drivers$date, ",", "")
uber_drivers$date = as.Date(uber_drivers$date, format = "%B %d %Y") # Converting to Date format
```

Sentiment analysis for Uber drivers, grouping by month, and plotting over time.

```{r}
uber_drivers$sentiment = as.numeric(get_sentiment(as.character(uber_drivers$review))) # Getting sentiment
uber_drivers2 = uber_drivers %>%
  group_by(date) %>%
  summarise(mean(sentiment))
names(uber_drivers2) = c("date", "sentiment")

#uber_drivers2 %>% ggplot(aes(x=date, y=sentiment)) + geom_col(color="red") + xlab("Sequence") +
#  ylab("Sentiment") + ggtitle("Driver Sentiment") +
#  theme(plot.title = element_text(hjust = 0.5)) + geom_smooth(method = "auto") +
#  geom_vline(xintercept = as.numeric(uber_drivers$date[3267]))

uber_drivers_monthly = uber_drivers2 %>% # Aggregating data into month bins
  group_by(month = floor_date(date, "month")) %>%
  summarise(sentiment = mean(sentiment))
uber_drivers_monthly %>%
  ggplot(aes(x = month, y = sentiment)) +
  geom_col(color = "black") +
  xlab("Date") + ylab("Sentiment") +
  theme(plot.title = element_text(hjust = 0.4)) +
  geom_smooth(method = "auto") +
  geom_vline(xintercept = as.numeric(uber_drivers$date[3267])) + # Reference line at the date of review 3267 (presumably around the CEO change)
  ggtitle("Sentiment over Time for Uber Drivers")
```

The following code snippet reads the number of reviews on the Indeed page for Uber (corporate) and then scrapes each page of employee reviews.

```{r}
number_employees = read_html("https://www.indeed.com/cmp/Uber/reviews?start=0") %>%
  html_nodes("b") %>%
  html_text()
number_employees = gsub(pattern = ",", replacement = "", number_employees) # Removing the comma so it can be converted to a number
number_employees = as.numeric(as.character(number_employees))
page2 = seq(from = 0, to = number_employees, by = 20)
uber_employees = NULL # Creating an empty data frame
for (start in page2) { # Loop over the pages and append each one to a running data frame
  na = rep(NA, 20)
  title = read_html(paste("https://www.indeed.com/cmp/Uber/reviews?start=", start, sep = '')) %>%
    html_nodes("div[itemprop='review']:nth-of-type(n+2) .cmp-review-title span:nth-of-type(1)") %>%
    html_text()
  #print(title)
  date = read_html(paste("https://www.indeed.com/cmp/Uber/reviews?start=", start, sep = '')) %>%
    html_nodes("div[itemprop='review']:nth-of-type(n+2) span.cmp-review-date-created") %>%
    html_text()
  #print(date)
  review = read_html(paste("https://www.indeed.com/cmp/Uber/reviews?start=", start, sep = '')) %>%
    html_nodes("div[itemprop='review']:nth-of-type(n+2) span[itemprop='reviewBody']") %>%
    html_text()
  #print(review)
  stars = read_html(paste("https://www.indeed.com/cmp/Uber/reviews?start=", start, sep = '')) %>%
    html_nodes("div[itemprop='review']:nth-of-type(n+2) div.cmp-ratingNumber") %>%
    html_text()
  stars = gsub(pattern = "", replacement = "", stars)
  stars = as.numeric(stars)
  #print(stars)
  position = read_html(paste("https://www.indeed.com/cmp/Uber/reviews?start=", start, sep = '')) %>%
    html_nodes("div[itemprop='review']:nth-of-type(n+2) span.cmp-reviewer") %>%
    html_text()
  position = gsub(pattern = "", replacement = "", position)
  #print(position)
  if (length(title) == 20) { title = title } else { title = na }
  if (length(date) == 20) { date = date } else { date = na }
  if (length(review) == 20) { review = review } else { review = na }
  if (length(stars) == 20) { stars = stars } else { stars = na }
  if (length(position) == 20) { position = position } else { position = na }
  temp = data.frame(title, review, date, stars, position)
  uber_employees = rbind(uber_employees, temp)
}
```

Converting the dates to Date format.

```{r}
uber_employees = na.omit(uber_employees)
uber_employees$date = str_replace_all(uber_employees$date, ",", "")
uber_employees$date = as.Date(uber_employees$date, format = "%B %d %Y") # Converting to Date format
```

Sentiment analysis for Uber employees, then grouping by month and plotting over time.

```{r}
uber_employees$sentiment = as.numeric(get_sentiment(as.character(uber_employees$review))) # Getting sentiment
uber_employees2 = uber_employees %>%
  #mutate(month = format(date, "%m"), year = format(date, "%Y")) %>%
  group_by(date) %>%
  summarise(mean(sentiment))
##uber_employees2$date = as.Date(uber_employees2$date, format = "%M %Y")
names(uber_employees2) = c("date", "sentiment")
uber_employees_monthly = uber_employees2 %>% # Aggregating data into month bins
  group_by(month = floor_date(date, "month")) %>%
  summarise(sentiment = mean(sentiment))
uber_employees_monthly %>%
  ggplot(aes(x = month, y = sentiment)) +
  geom_col(color = "white") +
  xlab("Date") + ylab("Sentiment") +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_smooth(method = "auto") +
  geom_vline(xintercept = as.numeric(uber_drivers$date[3267])) + # Same reference line as the driver plot
  ggtitle("Sentiment over Time for Uber Employees")
```

Uber Drivers versus Employees Sentiment

```{r}
# Area plot
uber_drivers_monthly$type = "driver"     # Tagging reviews that come from drivers
uber_employees_monthly$type = "employee" # Tagging reviews that come from employees
all_uber = rbind(uber_drivers_monthly, uber_employees_monthly)
ggplot(all_uber, aes(x = month, y = sentiment)) +
  geom_area(aes(color = type, fill = type), alpha = 0.3, position = position_dodge(0.8)) +
  scale_color_manual(values = c("#00AFBB", "#E7B800")) +
  scale_fill_manual(values = c("#00AFBB", "#E7B800")) +
  geom_vline(xintercept = as.numeric(uber_drivers$date[3267])) +
  xlab("Date") + ylab("Sentiment") +
  ggtitle("Sentiment over Time by Worker Type") +
  scale_x_date(limits = as.Date(c("2014-01-01", "2019-05-05")))
```

The following code snippet divides the drivers into before and after the new CEO took over, keeps only the bad reviews, and performs an LDA classification.
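For readers unfamiliar with the workflow, the sketch below is an editor's illustration with made-up toy reviews (not part of the analysis): LDA takes a document-term matrix and `terms()` then lists the highest-probability words per topic.

```{r}
# Editor's illustration with invented toy documents -- not part of the analysis.
library(tm)
library(topicmodels)
toy_corpus = VCorpus(VectorSource(c("pay low tips bad support",
                                    "flexible hours great pay",
                                    "app support slow and buggy",
                                    "great flexible schedule")))
toy_dtm = DocumentTermMatrix(toy_corpus)
toy_lda = LDA(toy_dtm, k = 2, method = "Gibbs", control = list(seed = 1234))
terms(toy_lda, 3) # Top 3 words per topic
```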
The following six snippets weren't actually included in the final project.

```{r}
uber_drivers_before = subset(uber_drivers, date >= as.Date("2010-11-25") & date <= as.Date("2017-08-27")) # Before the new CEO
#uber_drivers_before = uber_drivers_before %>%
#  filter(stars <= 2)
uber_drivers_after = subset(uber_drivers, date >= as.Date("2017-08-28") & date <= as.Date("2020-01-01")) # After the new CEO
#uber_drivers_after = uber_drivers_after %>%
#  filter(stars <= 2)
```

```{r}
# Creating a corpus of the reviews and cleaning it up
reviews_before = VCorpus(VectorSource(uber_drivers_before$review))
reviews_after = VCorpus(VectorSource(uber_drivers_after$review))
reviews_before = reviews_before %>%
  tm_map(removeWords, stopwords("english")) %>%               # Remove stopwords
  tm_map(stripWhitespace) %>%                                 # Remove whitespace
  tm_map(content_transformer(tolower)) %>%                    # Convert to lowercase
  tm_map(removePunctuation) %>%                               # Remove punctuation
  tm_map(removeNumbers) %>%                                   # Remove numbers
  tm_map(content_transformer(stemDocument), lazy = TRUE) %>%  # Stem words
  tm_map(content_transformer(removeWords), c("and", "said", "but", "the"), lazy = TRUE) # Remove further unnecessary words
reviews_after = reviews_after %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(content_transformer(stemDocument), lazy = TRUE) %>%
  tm_map(content_transformer(removeWords), c("and", "said", "but", "the"), lazy = TRUE)
dtm_before = DocumentTermMatrix(reviews_before)
dtm_after = DocumentTermMatrix(reviews_after)
# Removing sparse terms
dtms_before = removeSparseTerms(dtm_before, .99)
dtms_after = removeSparseTerms(dtm_after, .99)
```

```{r}
# Building an LDA model
dtm_matrix_lda_before = as.matrix(dtms_before)
dtm_matrix_lda_after = as.matrix(dtms_after)
terms_before = rowSums(dtm_matrix_lda_before) != 0 # Finding documents with no remaining terms
dtm_matrix_lda_before = dtm_matrix_lda_before[terms_before, ] # Dropping empty documents
terms_after = rowSums(dtm_matrix_lda_after) != 0
dtm_matrix_lda_after = dtm_matrix_lda_after[terms_after, ]
lda_uber_before <- LDA(dtm_matrix_lda_before, 12, method = "Gibbs", control = list(seed = 1234))
terms(lda_uber_before, 10)
lda_uber_after <- LDA(dtm_matrix_lda_after, 12, method = "Gibbs", control = list(seed = 1234))
terms(lda_uber_after, 10)
```

The following three code snippets divide the Uber employees into two groups and perform an LDA on each.

```{r}
# Dividing Uber employees into two subgroups
uber_employees2 = uber_employees %>%
  filter(position != "Driver" & position != "DRIVER" & position != "Uber Driver")
uber_employees_before = subset(uber_employees, date >= as.Date("2010-11-25") & date <= as.Date("2017-08-27")) # Before the new CEO
uber_employees_after = subset(uber_employees, date >= as.Date("2017-08-28") & date <= as.Date("2020-01-01")) # After the new CEO
```

```{r}
# Creating a corpus of the employee reviews and cleaning it up
employee_reviews_before = VCorpus(VectorSource(uber_employees_before$review))
employee_reviews_after = VCorpus(VectorSource(uber_employees_after$review))
employee_reviews_before = employee_reviews_before %>%
  tm_map(removeWords, stopwords("english")) %>%   # Remove stopwords
  tm_map(stripWhitespace) %>%                     # Remove whitespace
  tm_map(content_transformer(tolower)) %>%        # Convert to lowercase
  tm_map(removePunctuation) %>%                               # Remove punctuation
  tm_map(removeNumbers) %>%                                   # Remove numbers
  tm_map(content_transformer(stemDocument), lazy = TRUE) %>%  # Stem words
  tm_map(content_transformer(removeWords), c("and", "said", "but", "the"), lazy = TRUE) # Remove further unnecessary words
employee_reviews_after = employee_reviews_after %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(content_transformer(stemDocument), lazy = TRUE) %>%
  tm_map(content_transformer(removeWords), c("and", "said", "but", "the"), lazy = TRUE)
employee_dtm_before = DocumentTermMatrix(employee_reviews_before)
employee_dtm_after = DocumentTermMatrix(employee_reviews_after)
# Removing sparse terms
employee_dtms_before = removeSparseTerms(employee_dtm_before, .99)
employee_dtms_after = removeSparseTerms(employee_dtm_after, .99)
```

```{r}
# Building an LDA model for employees
employee_dtm_matrix_before = as.matrix(employee_dtms_before)
employee_dtm_matrix_after = as.matrix(employee_dtms_after)
employee_terms_before = rowSums(employee_dtm_matrix_before) != 0 # Finding documents with no remaining terms
employee_dtm_matrix_before = employee_dtm_matrix_before[employee_terms_before, ] # Dropping empty documents
employee_terms_after = rowSums(employee_dtm_matrix_after) != 0
employee_dtm_matrix_after = employee_dtm_matrix_after[employee_terms_after, ]
lda_uber_employee_before <- LDA(employee_dtm_matrix_before, 8, method = "Gibbs", control = list(seed = 1234))
terms(lda_uber_employee_before, 8)
lda_uber_employee_after <- LDA(employee_dtm_matrix_after, 8, method = "Gibbs", control = list(seed = 1234))
terms(lda_uber_employee_after, 8)
```

Looking at the cities with the best reviews.

```{r}
uber_driver_city = uber_drivers %>%
  group_by(city) %>%
  summarise(mean(sentiment))
names(uber_driver_city) = c("city", "sentiment")
uber_driver_city = uber_driver_city %>% arrange(desc(sentiment))
uber_driver_city$city2 = gsub(pattern = ",", replacement = "", uber_driver_city$city) # Removing the comma so Google can read the city name
```

The following code prepares the data frame for state-level analysis by geocoding each city to latitude and longitude.
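Note that `mutate_geocode()` uses ggmap's Google geocoding backend, which in current versions of ggmap has to be enabled by registering an API key first. A minimal sketch (the key string below is a placeholder, not a real key):

```{r}
# Editor's sketch: ggmap's geocode()/mutate_geocode() call the Google Maps
# Geocoding API, which requires a registered key in recent ggmap versions.
library(ggmap)
# register_google(key = "YOUR_GOOGLE_MAPS_API_KEY")  # placeholder key
```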
Geocoding this way solves a lot of trouble with the messy location data.

```{r}
uber_driver_city <- uber_driver_city %>% mutate_geocode(city2) # Adding lat/lon columns for each city
```

The following code prepares the data frame for state-level analysis.

```{r}
uber_driver_city2 = uber_driver_city # Duplicating the data frame, since Google geocoding takes a long time

latlong2state <- function(pointsDF) { # Converts coordinates back to state names
  # Prepare SpatialPolygons object with one SpatialPolygon
  # per state (plus DC, minus HI & AK)
  states <- map('state', fill = TRUE, col = "transparent", plot = FALSE)
  IDs <- sapply(strsplit(states$names, ":"), function(x) x[1])
  states_sp <- map2SpatialPolygons(states, IDs = IDs, proj4string = CRS("+proj=longlat +datum=wgs84"))
  # Convert pointsDF to a SpatialPoints object
  pointsSP <- SpatialPoints(pointsDF, proj4string = CRS("+proj=longlat +datum=wgs84"))
  # Use 'over' to get _indices_ of the Polygons object containing each point
  indices <- over(pointsSP, states_sp)
  # Return the state names of the Polygons object containing each point
  stateNames <- sapply(states_sp@polygons, function(x) x@ID)
  stateNames[indices]
}

uber_driver_city3 = uber_driver_city2
uber_driver_city3 = na.omit(uber_driver_city3) # Removing NA values
uber_driver_city3$state = latlong2state(uber_driver_city3[4:5]) # Adding a column of state names
uber_driver_city3 = na.omit(uber_driver_city3) # Removing NA values
states = read.csv("statelatlong.csv") # A data frame with state names and state codes
states = states[, c(-2, -3)]
states$City = tolower(states$City) # Converting state names to lowercase
uber_driver_city4 = left_join(uber_driver_city3, states, by = c("state" = "City")) # Associating each state with a code that plotly can read
```

Which cities are the best for Uber drivers?

```{r}
uber_driver_city10 = uber_driver_city2
uber_driver_city10$coordinates = paste(uber_driver_city10$lat, uber_driver_city10$lon, sep = ",") # Pasting the coordinates together
uber_driver_city10$cityname = geocode(uber_driver_city10$coordinates, output = "more") # Reverse geocoding to get a cleaner place name
uber_driver_city10 = na.omit(uber_driver_city10)
uber_driver_city10 = uber_driver_city10[-c(18, 156, 221, 287, 289, 405, 507, 732, 793, 995, 1016, 1083), ]
# The following code pulls out the city name only, because the data was very messy
pat = "(,.\\w+,)|(,.\\w+.\\w+,)"
uber_driver_city10$realcityname = gsub("(,\\s)|,", "", regmatches(m <- strsplit(uber_driver_city10$cityname[[5]], "\\|"), regexpr(pat, m)))
uber_driver_city11 = uber_driver_city10 %>%
  group_by(realcityname) %>%
  summarise(mean(sentiment))
names(uber_driver_city11) = c("city", "sentiment")
uber_driver_city11 = uber_driver_city11 %>% arrange(desc(sentiment))
uber_driver_city11$city <- factor(uber_driver_city11$city, levels = uber_driver_city11$city[order(uber_driver_city11$sentiment)])
best_cities2 = ggplot(uber_driver_city11[1:20, ], aes(x = city, y = sentiment)) +
  geom_bar(stat = "identity", width = .5, fill = "black") +
  labs(title = "Average Sentiment by City") +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
best_cities2
```

Old version of the best-cities code.

```{r}
uber_driver_city20 = uber_driver_city[1:20, ]
uber_driver_city20 = uber_driver_city20 %>% arrange(desc(sentiment))
uber_driver_city20$city <- factor(uber_driver_city20$city, levels = uber_driver_city20$city[order(uber_driver_city20$sentiment)])
best_cities = ggplot(uber_driver_city20, aes(x = city, y = sentiment)) +
  geom_bar(stat = "identity", width = .5, fill = "black") +
  labs(title = "Average Sentiment by City") +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
best_cities
```

Code for building the interactive map.
```{r}
uber_driver_city5 = uber_driver_city4 %>% # Grouping by state and finding the mean driver sentiment
  group_by(State) %>%
  summarise(mean(sentiment))
names(uber_driver_city5) = c("State", "Sentiment")

# Give state boundaries a white border
l <- list(color = toRGB("white"), width = 2)
# Specify some map projection/options
map <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('lightblue'))

uber_state_map <- plot_geo(uber_driver_city5, locationmode = 'USA-states') %>%
  add_trace(
    z = ~Sentiment, locations = ~State,
    color = ~Sentiment, colors = 'Oranges'
  ) %>%
  colorbar(title = "Sentiment Level") %>%
  layout(
    title = 'Historical Uber Driver Sentiment by State<br>(Hover for breakdown)',
    geo = map
  )
uber_state_map
```

Exporting the widget to be embedded in the website.

```{r}
Sys.setenv("plotly_username" = "mikeberg")
Sys.setenv("plotly_api_key" = "WBhxjuhZeFMD1Y9M9qbl")
chart_link = api_create(uber_state_map, filename = "choropleth")
#chart_link
```

Text analysis: first cleaning the text.

```{r}
uber_driver_text = uber_drivers
# Creating a corpus of the reviews and cleaning it up
reviews = VCorpus(VectorSource(uber_driver_text$review))
reviews = reviews %>%
  tm_map(removeWords, stopwords("english")) %>%  # Remove stopwords
  tm_map(stripWhitespace) %>%                    # Remove whitespace
  tm_map(content_transformer(tolower)) %>%       # Convert to lowercase
  tm_map(removePunctuation) %>%                  # Remove punctuation
  tm_map(removeNumbers)                          # Remove numbers
dtm_all = DocumentTermMatrix(reviews)
# Removing sparse terms
dtms_all = removeSparseTerms(dtm_all, .99)
```

Logistic regression for words associated with positive reviews.

```{r}
# Feature selection: positive
dtm_matrix_selection = as.matrix(dtms_all)
uber_driver_text$positive = uber_driver_text$stars == 4 | uber_driver_text$stars == 5 # TRUE/FALSE flag for whether the review was positive
corr_positive = cor(uber_driver_text$positive, dtm_matrix_selection)
top20_positive = order(corr_positive, decreasing = T)[1:20]
top20_positive_words = colnames(corr_positive)[top20_positive]
top20_positive_words
```

```{r}
# Building a logistic regression to see which words are most associated with positive reviews
logistic_positive = as.data.frame(cbind(positive = uber_driver_text$positive, dtm_matrix_selection[, top20_positive_words]))
predictive_positive = glm(positive ~ ., data = logistic_positive, family = binomial)
#summary(predictive_positive)
```

```{r}
# Words most significantly associated with leaving a good review
coef_positive = coef(predictive_positive)[-1]              # Pulling out the coefficients
positive.terms = coef_positive[coef_positive > 0]          # Keeping the positive coefficients
top.positive = sort(positive.terms, decreasing = T)[1:19]  # Sorting coefficients by magnitude
top.positive.frame = as.data.frame(top.positive)
top.positive.frame = rownames_to_column(top.positive.frame)
names(top.positive.frame) = c("word", "magnitude")
wordcloud(words = top.positive.frame$word, freq = top.positive.frame$magnitude,
          max.words = 200, random.order = FALSE, rot.per = .2,
          colors = brewer.pal(8, "Dark2"), scale = c(4, .5))
```

Feature selection for words associated with negative reviews.

```{r}
# Feature selection: negative
uber_driver_text$negative = uber_driver_text$stars == 1 | uber_driver_text$stars == 2 # TRUE/FALSE flag for whether the review was negative
corr_negative = cor(uber_driver_text$negative, dtm_matrix_selection)
top20_negative = order(corr_negative, decreasing = T)[1:20]
top20_negative_words = colnames(corr_negative)[top20_negative]
top20_negative_words
```

```{r}
# Building a logistic regression to see which words are most associated with negative reviews
logistic_negative = as.data.frame(cbind(negative = uber_driver_text$negative, dtm_matrix_selection[, top20_negative_words]))
predictive_negative = glm(negative ~ ., data = logistic_negative, family = binomial)
#summary(predictive_negative)
```

```{r}
# Words most significantly associated with leaving a bad review
coef_negative = coef(predictive_negative)[-1]
negative.terms = coef_negative[coef_negative > 0]
top.negative = sort(negative.terms, decreasing = T)[1:20]
top.negative.frame = as.data.frame(top.negative)
top.negative.frame = rownames_to_column(top.negative.frame)
names(top.negative.frame) = c("word", "magnitude")
"magnitude") wordcloud(words = top.negative.frame$word, freq = top.negative.frame$magnitude, max.words=200, random.order=FALSE, rot.per=.2, colors=brewer.pal(8, "Dark2"), scale=c(6,.5)) ``` Building an LDA model for Uber Reviews ```{r} #Building an LDA Model dtm_matrix_lda = as.matrix(dtms_all) terms = rowSums(dtm_matrix_lda) != 0 #Finding values that never appear dtm_matrix_lda = dtm_matrix_lda[terms,] #Clearing out values that never appear ldaOut <-LDA(dtm_matrix_lda, 10, method="Gibbs", control = list(seed = 1234)) terms(ldaOut,10) ``` ```{r} #LDA Model for words associate with positive Revies, that is for reviews with star ratings of 4 or 5 dtm_matrix_lda_positive = as.matrix(dtms_all) dtm_matrix_lda_positive = dtm_matrix_lda_positive[which(uber_driver_text$positive==TRUE),] #Taking only rows associated with positive sentiment terms2 = rowSums(dtm_matrix_lda_positive) != 0 dtm_matrix_lda_positive = dtm_matrix_lda_positive[terms2,] lda_positive <-LDA(dtm_matrix_lda_positive, 9, method="Gibbs", control = list(seed = 1234)) terms(lda_positive,10) ``` ```{r} #LDA Model for words associate with negative Revies, that is for reviews with star ratings of 1 or 2 dtm_matrix_lda_negative = as.matrix(dtms_all) uber_driver_text$negative = uber_driver_text$stars <= 2 #Creating a row of bad reviews dtm_matrix_lda_negative = dtm_matrix_lda_negative[which(uber_driver_text$negative==TRUE),] #Taking only the rows with bad reviews terms3 = rowSums(dtm_matrix_lda_negative) != 0 dtm_matrix_lda_negative = dtm_matrix_lda_negative[terms3,] lda_negative <-LDA(dtm_matrix_lda_negative,9, method="Gibbs", control = list(seed = 1234)) terms(lda_negative,10) ``` ```{r} #Looking at the words associated with positive reviews using beta analysis positive_words <- tidy(lda_positive, matrix = "beta") postive_top_terms <- positive_words %>% group_by(topic) %>% top_n(10, beta) %>% ungroup() %>% arrange(topic, -beta) postive_top_terms %>% mutate(term = reorder(term, beta)) %>% ggplot(aes(term, beta, fill = factor(topic))) + geom_col(show.legend = FALSE) + facet_wrap(~ topic, scales = "free") + coord_flip() ``` ```{r} #Looking at the words associated with negative reviews using beta analysis negative_words <- tidy(lda_negative, matrix = "beta") negative_top_terms <- negative_words %>% group_by(topic) %>% top_n(10, beta) %>% ungroup() %>% arrange(topic, -beta) negative_top_terms %>% mutate(term = reorder(term, beta)) %>% ggplot(aes(term, beta, fill = factor(topic))) + geom_col(show.legend = FALSE) + facet_wrap(~ topic, scales = "free") + coord_flip() ``` ```{r} #The following code snippet reads the number of reviews on the website for the lyft page number_lyft = read_html(paste("https://www.indeed.com/cmp/Lyft/reviews?fjobtitle=Driver&start=0")) %>% html_nodes("b") %>% html_text() number_lyft = gsub(pattern = ",", replacement = "", number_lyft) #removing the comma so it can be transformed into a number number_lyft = as.numeric(as.character(number_lyft)) ``` ```{r} #Scraping the tite of the review, the text, the date, and the number of stars given and putting it into a data frame page = seq(from = 0, to = number_lyft, by = 20) #Deriving the number of pages to be considered lyft_drivers = NULL #Creating an empty data frame for (start in (page)) { # For loop to read in the file and append it to a "running total dataframe" called articles} na = c(NA, NA,NA,NA, NA,NA,NA, NA,NA,NA, NA,NA,NA, NA,NA,NA, NA,NA,NA, NA) title = read_html(paste("https://www.indeed.com/cmp/Lyft/reviews?fjobtitle=Driver&start=", start, sep = '')) %>% 
html_nodes("div[itemprop='review']:nth-of-type(n+2) .cmp-review-title span:nth-of-type(1)") %>% html_text() #print(title) date = read_html(paste("https://www.indeed.com/cmp/Lyft/reviews?fjobtitle=Driver&start=", start, sep = '')) %>% html_nodes("div[itemprop='review']:nth-of-type(n+2) span.cmp-review-date-created") %>% html_text() #print(date) review = read_html(paste("https://www.indeed.com/cmp/Lyft/reviews?fjobtitle=Driver&start=", start, sep = '')) %>% html_nodes("div[itemprop='review']:nth-of-type(n+2) span[itemprop='reviewBody']") %>% html_text() #print(review) stars = read_html(paste("https://www.indeed.com/cmp/Lyft/reviews?fjobtitle=Driver&start=", start, sep = '')) %>% html_nodes("div[itemprop='review']:nth-of-type(n+2) div.cmp-ratingNumber") %>% html_text() stars = gsub(pattern = "", replacement = "", stars) stars = as.numeric(stars) #print(stars) city = read_html(paste("https://www.indeed.com/cmp/Lyft/reviews?fjobtitle=Driver&start=", start, sep = '')) %>% html_nodes("div[itemprop='review']:nth-of-type(n+2) span.cmp-reviewer-job-location") %>% html_text() #The web scraper about 5 times will break if it only finds 19 elements on the page instead of 20. The following set of if statements allows for loop to run even if this occurr by inserting a string of NA's when Web scraper doesn't return the correct number of values if(length(title == 20)) { title = title } else { title = na } if(length(date == 20)) { date = date } else { date = na } if(length(review == 20)) { review = review } else { review = na } if(length(stars == 20)) { stars = stars } else { stars = na } if(length(city == 20)) { city = city } else { title = na } temp = data.frame(title, review, date, stars, city) lyft_drivers = rbind(lyft_drivers, temp) } lyft_drivers = na.omit(lyft_drivers) lyft_drivers$position = "driver" lyft_drivers$date = str_replace_all(lyft_drivers$date, ",", "") lyft_drivers$date = as.Date(lyft_drivers$date, format = "%B %d %Y") #Converting it to date format ``` ```{r} lyft_drivers$sentiment = as.numeric(get_sentiment(as.character(lyft_drivers$review))) #Getting sentiment lyft_drivers2 = lyft_drivers %>% group_by(date) %>% summarise(mean(sentiment)) names(lyft_drivers2) = c("date", "sentiment") lyft_drivers_monthly = lyft_drivers2 %>% #Aggregating data into month bins group_by(month=floor_date(date, "month")) %>% summarise(sentiment = mean(sentiment)) lyft_drivers_monthly %>% ggplot(aes(x=month, y=sentiment)) + geom_col(color="pink") + xlab("Sequence") + ylab("Sentiment") + ggtitle("Lyft Driver Sentiment") + theme(plot.title = element_text(hjust = 0.5)) + geom_smooth(method = "auto") +xlab("Date") ``` ```{r} # Area plot uber_drivers_monthly$type=NULL uber_drivers_monthly$company = "uber" lyft_drivers_monthly$company = "lyft" lyft_v_uber = rbind(uber_drivers_monthly, lyft_drivers_monthly) ggplot(lyft_v_uber, aes(x = month, y = sentiment)) + geom_area(aes(color = company, fill = company), alpha = 0.3, position = position_dodge(0.8)) + xlab("Date") + ylab("Sentiment") + ggtitle("Lyft versus Uber", subtitle = "Driver Sentiment over Time")+ scale_x_date(limits = as.Date(c("2014-05-01","2019-05-05"))) ``` Lyft Best Cities Old ```{r} lyft_driver_city = lyft_drivers %>% group_by(city) %>% summarise(mean(sentiment)) names(lyft_driver_city) = c("city", "sentiment") lyft_driver_city =lyft_driver_city %>% arrange(desc(sentiment)) lyft_driver_city$city2 = gsub(pattern = ",", replacement = "", lyft_driver_city$city) #Taking out the comma to make it readable to google ``` ```{r} lyft_driver_city <- 
  lyft_driver_city %>% mutate_geocode(city2) # Adding lat/lon columns for each city
```

```{r}
lyft_driver_city2 = lyft_driver_city # Duplicating the data frame, since Google geocoding takes a long time

latlong2state <- function(pointsDF) { # Converts coordinates back to state names
  # Prepare SpatialPolygons object with one SpatialPolygon
  # per state (plus DC, minus HI & AK)
  states <- map('state', fill = TRUE, col = "transparent", plot = FALSE)
  IDs <- sapply(strsplit(states$names, ":"), function(x) x[1])
  states_sp <- map2SpatialPolygons(states, IDs = IDs, proj4string = CRS("+proj=longlat +datum=wgs84"))
  # Convert pointsDF to a SpatialPoints object
  pointsSP <- SpatialPoints(pointsDF, proj4string = CRS("+proj=longlat +datum=wgs84"))
  # Use 'over' to get _indices_ of the Polygons object containing each point
  indices <- over(pointsSP, states_sp)
  # Return the state names of the Polygons object containing each point
  stateNames <- sapply(states_sp@polygons, function(x) x@ID)
  stateNames[indices]
}

lyft_driver_city3 = lyft_driver_city2
lyft_driver_city3 = na.omit(lyft_driver_city3) # Removing NA values
lyft_driver_city3$state = latlong2state(lyft_driver_city3[4:5]) # Adding a column of state names
lyft_driver_city3 = na.omit(lyft_driver_city3) # Removing NA values
states = read.csv("statelatlong.csv") # A data frame with state names and state codes
states = states[, c(-2, -3)]
states$City = tolower(states$City) # Converting state names to lowercase
lyft_driver_city4 = left_join(lyft_driver_city3, states, by = c("state" = "City")) # Associating each state with a code that plotly can read
```

Lyft driver best cities.

```{r}
lyft_driver_city10 = lyft_driver_city2
lyft_driver_city10$coordinates = paste(lyft_driver_city10$lat, lyft_driver_city10$lon, sep = ",") # Pasting the coordinates together
lyft_driver_city10$cityname = geocode(lyft_driver_city10$coordinates, output = "more") # Reverse geocoding to get a cleaner place name
lyft_driver_city10 = na.omit(lyft_driver_city10)
lyft_driver_city10
lyft_driver_city10 = lyft_driver_city10[-c(20, 144), ]
# The following code pulls out the city name only, because the data was very messy
pat = "(,.\\w+,)|(,.\\w+.\\w+,)"
lyft_driver_city10$realcityname = gsub("(,\\s)|,", "", regmatches(m <- strsplit(lyft_driver_city10$cityname[[5]], "\\|"), regexpr(pat, m)))
lyft_driver_city11 = lyft_driver_city10 %>%
  group_by(realcityname) %>%
  summarise(mean(sentiment))
names(lyft_driver_city11) = c("city", "sentiment")
lyft_driver_city11 = lyft_driver_city11 %>% arrange(desc(sentiment))
lyft_driver_city11$city <- factor(lyft_driver_city11$city, levels = lyft_driver_city11$city[order(lyft_driver_city11$sentiment)])
lyft_best_cities2 = ggplot(lyft_driver_city11[1:20, ], aes(x = city, y = sentiment)) +
  geom_bar(stat = "identity", width = .5, fill = "hotpink") +
  labs(title = "Lyft Average Sentiment by City") +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
lyft_best_cities2
```

```{r}
lyft_driver_city5 = lyft_driver_city4 %>% # Grouping by state and finding the mean driver sentiment
  group_by(State) %>%
  summarise(mean(sentiment))
names(lyft_driver_city5) = c("State", "Sentiment")

# Give state boundaries a white border
l <- list(color = toRGB("white"), width = 2)
# Specify some map projection/options
lyft_map <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('lightblue'))

lyft_state_map <- plot_geo(lyft_driver_city5, locationmode = 'USA-states') %>%
  add_trace(
    z = ~Sentiment, locations = ~State,
    color = ~Sentiment, colors = 'Reds'
  ) %>%
  colorbar(title = "Sentiment Level") %>%
  layout(
    title = 'Historical Lyft Driver Sentiment by State<br>(Hover for breakdown)',
    geo = lyft_map
  )
lyft_state_map
#chart_link_lyft = api_create(lyft_state_map, filename = "lyft_choropleth") # Exporting the Lyft state map to be embedded
#chart_link_lyft
```

Performing a two-sample t-test to see whether sentiment amongst Uber drivers is better than that amongst Lyft drivers.

```{r}
t.test(uber_drivers$sentiment, lyft_drivers$sentiment, paired = FALSE, alternative = "two.sided")
```

Performing a t-test between Uber drivers before and after the CEO switch.

```{r}
t.test(uber_drivers_before$sentiment, uber_drivers_after$sentiment, paired = FALSE, alternative = "two.sided")
```

Performing a t-test between Uber employees before and after the CEO switch.

```{r}
t.test(uber_employees_before$sentiment, uber_employees_after$sentiment, paired = FALSE, alternative = "two.sided")
```

```{r}
t.test(uber_drivers$sentiment, uber_employees$sentiment, paired = FALSE, alternative = "two.sided")
```

Looking to see if there is a correlation between state minimum wage and driver sentiment.

```{r}
minimum_wage = read.csv("Minimum Wage Data.csv")
minimum_wage$State = tolower(minimum_wage$State)
minimum_wage_avg = minimum_wage %>%
  filter(Year == 2012 | Year == 2013 | Year == 2014 | Year == 2015 |
           Year == 2016 | Year == 2017 | Year == 2018 | Year == 2019) %>%
  group_by(State) %>%
  summarise(mean(High.Value))
minimum_wage_avg = left_join(minimum_wage_avg, states, by = c("State" = "City"))
minimum_wage_avg = na.omit(minimum_wage_avg)
collective = left_join(uber_driver_city5, minimum_wage_avg, by = c("State" = "State.y"))
names(collective) = c("Code", "Sentiment", "State", "MinWage")
# A negative correlation is expected: the higher the state minimum wage, the less attractive driving for Uber becomes
sentiment_wage_correlation = cor(collective$Sentiment, collective$MinWage)
sentiment_wage_correlation # Correlation of state minimum wage and sentiment
```

Creating a scatterplot.

```{r}
ggscatter(collective, x = "MinWage", y = "Sentiment",
          add = "reg.line", conf.int = TRUE,
          cor.coef = TRUE, cor.method = "pearson",
          xlab = "Minimum Wage", ylab = "Sentiment")
```

The following block of code looked at whether census variables were able to predict sentiment by state.
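A model of the kind this was aiming for would look roughly like the sketch below (an editor's illustration, not something fitted in the project; it assumes the `census_uber_state` frame built in the next chunk and a few of its column names, so it is not evaluated here).

```{r, eval=FALSE}
# Editor's sketch (hypothetical, not fitted in the project): regressing state-level
# driver sentiment on a few of the census metrics prepared in the next chunk.
census_model <- lm(Sentiment ~ `mean(IncomePerCap)` + `mean(Unemployment)` + `mean(MeanCommute)`,
                   data = census_uber_state)
summary(census_model)
```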
Unfortunately, I couldn't build the model, because the census metrics weren't as relevant as I had thought.

```{r}
census = read.csv("acs2017_county_data.csv")
census_state = census %>% # Collapsing to state level and computing the average of several potentially important metrics
  group_by(State) %>%
  summarise(median(Income), mean(IncomePerCap), mean(Poverty), mean(Professional),
            mean(Service), mean(Office), mean(Construction), mean(Production),
            mean(Drive), mean(Carpool), mean(Transit), mean(Walk), mean(WorkAtHome),
            mean(MeanCommute), mean(Employed), mean(Unemployment))
census_state$State = tolower(census_state$State) # Making state names lowercase
census_state = left_join(census_state, states, by = c("State" = "City")) # Joining with the state codes
census_state = na.omit(census_state) # Removing NA values
names(census_state)[18] = c("State_Code")
census_uber_state = left_join(uber_driver_city5, census_state, by = c("State" = "State_Code")) # Merging Uber driver sentiment with the census metrics
census_uber_state = census_uber_state[, -3] # Removing the written-out state names
getwd()
write.csv(census_uber_state, file = "uber.csv", row.names = TRUE)
```

```{r}
all_states = inner_join(uber_driver_city5, lyft_driver_city5, by = "State") # States with both Uber and Lyft driver sentiment
sentiment_correlation = cor(all_states$Sentiment.x, all_states$Sentiment.y) # Correlation between Uber and Lyft driver sentiment by state
sentiment_correlation
```
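To see the relationship behind that correlation, a quick scatter of the two state-level sentiment series (an editor's sketch; it only uses the `all_states` frame created above).

```{r}
# Editor's sketch: visualising the state-level relationship computed above.
ggplot(all_states, aes(x = Sentiment.x, y = Sentiment.y)) +
  geom_point() +
  geom_smooth(method = "lm") +
  xlab("Uber driver sentiment") +
  ylab("Lyft driver sentiment") +
  ggtitle("Uber versus Lyft Driver Sentiment by State")
```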