---
title: "R Notebook"
output: html_notebook
---
Most of these libraries have been used at one point or another in the code; some were left out, however.
```{r}
#Necessary Libraries
library(lubridate)
library(ggpubr)
library(xts)
library(topicmodels)
library(plotly)
library(tm)
library(sentimentr)
library(dplyr)
library(syuzhet)
library(rvest)
library(httr)
library(magrittr)
library(stringr)
library(tm)
library(tibble)
library(rvest)
library(tidytext)
library(ggplot2)
library(wordcloud)
library(tidyr)
library(usmap)
library(maps)
library(ggmap)
library(googleAuthR)
library(sp)
library(maps)
library(maptools)
library(revgeo)
library(choroplethrMaps)
library(mapproj)
library(SnowballC)
library(RColorBrewer)
```
The following code snippet reads the number of reviews on the website for the uber partner drivers site
```{r}
# Read the total number of reviews shown on the Indeed page for Uber partner
# drivers; the count appears in the page's first <b> element.
number = read_html("https://www.indeed.com/cmp/Uber-Partner-Drivers/reviews?start=0") %>%
  html_nodes("b") %>%
  html_text()
number = gsub(pattern = ",", replacement = "", number) # strip the thousands separator so it parses as a number
number = as.numeric(number) # already character after gsub; the as.character() wrapper was redundant
```
Scraping the title of the review, the text, the date, and the number of stars given, and putting it into a data frame
```{r}
# Scrape the title, review text, date, star rating, and city for every review
# page (20 reviews per page) and accumulate them in uber_drivers.
page = seq(from = 0, to = number, by = 20) # page offsets to request
uber_drivers = NULL # running-total data frame
for (start in page) {
  na = rep(NA, 20) # filler when a page yields fewer than 20 of a field
  # Parse the page once and reuse the document for every selector; the
  # original re-downloaded the same URL five times per iteration.
  doc = read_html(paste("https://www.indeed.com/cmp/Uber-Partner-Drivers/reviews?start=", start, sep = ''))
  title = doc %>%
    html_nodes("div[itemprop='review']:nth-of-type(n+2) .cmp-review-title span:nth-of-type(1)") %>%
    html_text()
  date = doc %>%
    html_nodes("div[itemprop='review']:nth-of-type(n+2) span.cmp-review-date-created") %>%
    html_text()
  review = doc %>%
    html_nodes("div[itemprop='review']:nth-of-type(n+2) span[itemprop='reviewBody']") %>%
    html_text()
  stars = doc %>%
    html_nodes("div[itemprop='review']:nth-of-type(n+2) div.cmp-ratingNumber") %>%
    html_text()
  stars = as.numeric(stars) # the original gsub("", "", stars) was a no-op and was dropped
  city = doc %>%
    html_nodes("div[itemprop='review']:nth-of-type(n+2) span.cmp-reviewer-job-location") %>%
    html_text()
  # The scraper occasionally finds only 19 elements on a page. Pad any field
  # that did not come back with exactly 20 values so data.frame() succeeds.
  # BUG FIX: the original tested length(x == 20), which is TRUE whenever the
  # vector is non-empty, so the padding never triggered and a short page
  # crashed data.frame() on mismatched column lengths.
  if (length(title) != 20) title = na
  if (length(date) != 20) date = na
  if (length(review) != 20) review = na
  if (length(stars) != 20) stars = na
  if (length(city) != 20) city = na
  temp = data.frame(title, review, date, stars, city)
  uber_drivers = rbind(uber_drivers, temp)
}
uber_drivers = na.omit(uber_drivers) # drop the NA-padded rows
uber_drivers$position = "driver"
uber_drivers$date = str_replace_all(uber_drivers$date, ",", "") # "January 5, 2019" -> "January 5 2019"
uber_drivers$date = as.Date(uber_drivers$date, format = "%B %d %Y") # convert to Date
```
Sentiment analysis for uber drivers, grouping by month, and plotting over time
```{r}
# Score each driver review with syuzhet sentiment, average per day, then per
# month, and plot the monthly means over time.
uber_drivers$sentiment = as.numeric(get_sentiment(as.character(uber_drivers$review)))
uber_drivers2 = uber_drivers %>%
  group_by(date) %>%
  summarise(sentiment = mean(sentiment)) # daily mean sentiment
uber_drivers_monthly = uber_drivers2 %>%
  group_by(month = floor_date(date, "month")) %>%
  summarise(sentiment = mean(sentiment)) # monthly mean of the daily means
# The vertical line marks uber_drivers$date[3267] -- presumably the CEO
# change; the magic row index is data-dependent, verify after re-scraping.
uber_drivers_monthly %>%
  ggplot(aes(x = month, y = sentiment)) +
  geom_col(color = "black") +
  xlab("Sequence") +
  ylab("Sentiment") +
  theme(plot.title = element_text(hjust = 0.4)) +
  geom_smooth(method = "auto") +
  geom_vline(xintercept = as.numeric(uber_drivers$date[3267])) +
  ggtitle("Sentiment over Time for Uber Drivers")
```
The following code snippet reads the number of reviews on the website for the uber page
```{r}
# Count the reviews on the Uber (all employees) page, then scrape every page:
# title, text, date, star rating, and the reviewer's position.
number_employees = read_html("https://www.indeed.com/cmp/Uber/reviews?start=0") %>%
  html_nodes("b") %>%
  html_text()
number_employees = gsub(pattern = ",", replacement = "", number_employees) # strip the thousands separator
number_employees = as.numeric(number_employees) # already character; as.character() wrapper dropped
page2 = seq(from = 0, to = number_employees, by = 20) # page offsets to request
uber_employees = NULL # running-total data frame
for (start in page2) {
  na = rep(NA, 20) # filler when a page yields fewer than 20 of a field
  # Parse the page once and reuse the document for every selector; the
  # original re-downloaded the same URL five times per iteration.
  doc = read_html(paste("https://www.indeed.com/cmp/Uber/reviews?start=", start, sep = ''))
  title = doc %>%
    html_nodes("div[itemprop='review']:nth-of-type(n+2) .cmp-review-title span:nth-of-type(1)") %>%
    html_text()
  date = doc %>%
    html_nodes("div[itemprop='review']:nth-of-type(n+2) span.cmp-review-date-created") %>%
    html_text()
  review = doc %>%
    html_nodes("div[itemprop='review']:nth-of-type(n+2) span[itemprop='reviewBody']") %>%
    html_text()
  stars = doc %>%
    html_nodes("div[itemprop='review']:nth-of-type(n+2) div.cmp-ratingNumber") %>%
    html_text()
  stars = as.numeric(stars) # the original gsub("", "", stars) was a no-op and was dropped
  position = doc %>%
    html_nodes("div[itemprop='review']:nth-of-type(n+2) span.cmp-reviewer") %>%
    html_text()
  # Pad short fields with NA so data.frame() gets equal-length columns.
  # BUG FIX: the original tested length(x == 20), which is TRUE for any
  # non-empty vector, so the padding never triggered and a 19-element page
  # crashed the loop.
  if (length(title) != 20) title = na
  if (length(date) != 20) date = na
  if (length(review) != 20) review = na
  if (length(stars) != 20) stars = na
  if (length(position) != 20) position = na
  temp = data.frame(title, review, date, stars, position)
  uber_employees = rbind(uber_employees, temp)
}
```
Converting the dates to be read in date format
```{r}
# Drop the NA-padded rows and parse dates like "January 5 2019" into Date
# objects (the comma must be stripped before parsing with %B %d %Y).
uber_employees = na.omit(uber_employees)
uber_employees$date = as.Date(str_replace_all(uber_employees$date, ",", ""),
                              format = "%B %d %Y")
```
Plotting the sentiment for uber drivers over time
Sentiment analysis for uber employees and then grouping by month and plotting over time
```{r}
# Employee review sentiment: score each review, average per day, then per
# month, and plot the monthly means over time.
uber_employees$sentiment = as.numeric(get_sentiment(as.character(uber_employees$review)))
uber_employees2 = uber_employees %>%
  group_by(date) %>%
  summarise(sentiment = mean(sentiment)) # daily mean sentiment
uber_employees_monthly = uber_employees2 %>%
  group_by(month = floor_date(date, "month")) %>%
  summarise(sentiment = mean(sentiment)) # monthly mean of the daily means
uber_employees_monthly %>%
  ggplot(aes(x = month, y = sentiment)) +
  geom_col(color = "white") +
  xlab("Date") +
  ylab("Sentiment") +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_smooth(method = "auto") +
  # Marker at uber_drivers$date[3267] -- presumably the CEO change date;
  # the row index is data-dependent, verify after re-scraping.
  geom_vline(xintercept = as.numeric(uber_drivers$date[3267])) +
  ggtitle("Sentiment over Time for Uber Employees")
```
Uber Drivers versus Employees Sentiment
```{r}
# Overlay monthly driver and employee sentiment as semi-transparent areas.
uber_drivers_monthly$type = "driver"     # tag the source of each row
uber_employees_monthly$type = "employee"
all_uber = rbind(uber_drivers_monthly, uber_employees_monthly)
ggplot(all_uber, aes(x = month, y = sentiment)) +
  geom_area(aes(color = type, fill = type),
            alpha = 0.3, position = position_dodge(0.8)) +
  scale_color_manual(values = c("#00AFBB", "#E7B800")) +
  scale_fill_manual(values = c("#00AFBB", "#E7B800")) +
  # Vertical marker at uber_drivers$date[3267] (CEO change, per notebook text)
  geom_vline(xintercept = as.numeric(uber_drivers$date[3267])) +
  xlab("Date") +
  ylab("Sentiment") +
  ggtitle("Sentiment over Time by Worker Type") +
  scale_x_date(limits = as.Date(c("2014-01-01", "2019-05-05")))
```
The following code snippet divides drivers into before and after the new CEO took over, then takes only the bad reviews and performs an LDA classification. The following six snippets weren't really included in the project.
```{r}
# Split driver reviews into before/after the new CEO took over (2017-08-28,
# per the notebook text). The commented-out low-star filters are kept from
# earlier experimentation.
uber_drivers_before = uber_drivers %>%
  filter(date >= as.Date("2010-11-25"), date <= as.Date("2017-08-27"))
#uber_drivers_before = uber_drivers_before %>%
#filter(stars <=2)
uber_drivers_after = uber_drivers %>%
  filter(date >= as.Date("2017-08-28"), date <= as.Date("2020-01-01"))
#uber_drivers_after = uber_drivers_after %>%
#filter(stars <=2)
```
```{r}
# Build and clean a corpus of driver reviews for each period.
# Clean a tm corpus: BUG FIX -- lower-case BEFORE removing stopwords, because
# stopwords("english") is all lower-case and the original order (removeWords
# first) left capitalized stopwords ("The", "I", ...) in the corpus. The
# duplicated pipeline is also factored into one helper.
clean_corpus = function(corpus) {
  corpus %>%
    tm_map(content_transformer(tolower)) %>%            # lower-case first
    tm_map(removeWords, stopwords("english")) %>%       # then drop stopwords
    tm_map(removePunctuation) %>%
    tm_map(removeNumbers) %>%
    tm_map(stripWhitespace) %>%
    tm_map(content_transformer(stemDocument), lazy = TRUE) %>%  # stem words
    tm_map(content_transformer(removeWords), c("and", "said", "but", "the"), lazy = TRUE) # extra filler words
}
reviews_before = clean_corpus(VCorpus(VectorSource(uber_drivers_before$review)))
reviews_after = clean_corpus(VCorpus(VectorSource(uber_drivers_after$review)))
dtm_before = DocumentTermMatrix(reviews_before)
dtm_after = DocumentTermMatrix(reviews_after)
# Drop terms absent from more than 99% of documents
dtms_before = removeSparseTerms(dtm_before, .99)
dtms_after = removeSparseTerms(dtm_after, .99)
```
```{r}
#Building an LDA Model
# Fit LDA topic models (12 topics, Gibbs sampling, fixed seed) on driver
# reviews from before and after the CEO change, and print the top terms.
dtm_matrix_lda_before = as.matrix(dtms_before)
dtm_matrix_lda_after = as.matrix(dtms_after)
terms_before = rowSums(dtm_matrix_lda_before) != 0 # TRUE for documents that still contain at least one term
dtm_matrix_lda_before = dtm_matrix_lda_before[terms_before,] # drop empty documents -- LDA cannot fit a document with no words
terms_after = rowSums(dtm_matrix_lda_after) != 0 # TRUE for non-empty documents
dtm_matrix_lda_after = dtm_matrix_lda_after[terms_after,] # drop empty documents
lda_uber_before <-LDA(dtm_matrix_lda_before, 12, method="Gibbs", control = list(seed = 1234))
terms(lda_uber_before,10) # top 10 terms per topic
lda_uber_after <-LDA(dtm_matrix_lda_after, 12, method="Gibbs", control = list(seed = 1234))
terms(lda_uber_after,10)
```
The following three code snippets divide uber employees into two groups and perform an LDA on each
```{r}
# Divide Uber employees (corporate roles only) into before/after the CEO change.
# BUG FIX: the original filtered driver roles into uber_employees2 but then
# subset the UNFILTERED uber_employees frame, so the filter had no effect; it
# also clobbered the daily-sentiment summary stored in uber_employees2 by an
# earlier chunk. A distinct name is used and the subsets take the filtered data.
uber_employees_filtered = uber_employees %>%
  filter(position != "Driver" & position != "DRIVER" & position != "Uber Driver")
uber_employees_before = subset(uber_employees_filtered, date>=as.Date("2010-11-25") & date<=as.Date("2017-08-27")) #Before the new CEO
uber_employees_after = subset(uber_employees_filtered, date>=as.Date("2017-08-28") & date<=as.Date("2020-01-01")) #After the new CEO
```
```{r}
# Build and clean a corpus of EMPLOYEE reviews for each period.
# BUG FIX: the original built both corpora from uber_drivers_before/after, so
# the "employee" LDA was actually fit on driver reviews. Also lower-case before
# removing stopwords so capitalized stopwords ("The", "I", ...) are caught too.
employee_reviews_before = VCorpus(VectorSource(uber_employees_before$review))
employee_reviews_after = VCorpus(VectorSource(uber_employees_after$review))
employee_reviews_before = employee_reviews_before %>%
  tm_map(content_transformer(tolower)) %>%            # lower-case first
  tm_map(removeWords, stopwords("english")) %>%       # then drop stopwords
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(stemDocument), lazy = TRUE) %>%  # stem words
  tm_map(content_transformer(removeWords), c("and", "said", "but", "the"), lazy = TRUE) # extra filler words
employee_reviews_after = employee_reviews_after %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(stemDocument), lazy = TRUE) %>%
  tm_map(content_transformer(removeWords), c("and", "said", "but", "the"), lazy = TRUE)
employee_dtm_before = DocumentTermMatrix(employee_reviews_before)
employee_dtm_after = DocumentTermMatrix(employee_reviews_after)
# Drop terms absent from more than 99% of documents
employee_dtms_before = removeSparseTerms(employee_dtm_before, .99)
employee_dtms_after = removeSparseTerms(employee_dtm_after, .99)
```
```{r}
#Building an LDA Model for employees
# Fit LDA topic models (8 topics, Gibbs sampling, fixed seed) on employee
# reviews before and after the CEO change, and print the top terms.
employee_dtm_matrix_before = as.matrix(employee_dtms_before)
employee_dtm_matrix_after = as.matrix(employee_dtms_after)
employee_terms_before = rowSums(employee_dtm_matrix_before) != 0 # TRUE for documents that still contain at least one term
employee_dtm_matrix_before = employee_dtm_matrix_before[employee_terms_before,] # drop empty documents -- LDA cannot fit them
employee_terms_after = rowSums(employee_dtm_matrix_after) != 0 # TRUE for non-empty documents
employee_dtm_matrix_after = employee_dtm_matrix_after[employee_terms_after,] # drop empty documents
lda_uber_employee_before <-LDA(employee_dtm_matrix_before, 8, method="Gibbs", control = list(seed = 1234))
terms(lda_uber_employee_before,8) # top 8 terms per topic
lda_uber_employee_after <-LDA(employee_dtm_matrix_after, 8, method="Gibbs", control = list(seed = 1234))
terms(lda_uber_employee_after,8)
```
Looking at the cities with the best reviews
```{r}
# Mean driver sentiment per city, sorted best-first. city2 is a comma-free
# copy of the city string so the Google geocoder can read it.
uber_driver_city = uber_drivers %>%
  group_by(city) %>%
  summarise(sentiment = mean(sentiment)) %>%
  arrange(desc(sentiment))
uber_driver_city$city2 = gsub(pattern = ",", replacement = "", uber_driver_city$city)
```
The following code prepares the data frame for state analysis by geocoding each city and providing latitude/longitude. This solves a lot of trouble with the messy data.
```{r}
# Geocode every city via the Google Maps API (network call; slow, and requires
# a configured ggmap API key). Adds coordinate columns to the data frame --
# presumably lon/lat in that order, as mutate_geocode usually appends; verify.
uber_driver_city <- uber_driver_city %>%
  mutate_geocode(city2) #Adding a lat/lon column for each city
```
The following code prepares the data frame for state analysis
```{r}
uber_driver_city2 = uber_driver_city # duplicate: Google geocoding is slow, keep a copy
# Map (lon, lat) points back to lower-48 state names (plus DC; HI/AK excluded).
latlong2state <- function(pointsDF) {
  # Prepare a SpatialPolygons object with one SpatialPolygon per state,
  # built from the maps package outlines.
  states <- map('state', fill=TRUE, col="transparent", plot=FALSE)
  IDs <- sapply(strsplit(states$names, ":"), function(x) x[1])
  # BUG FIX: proj4 datum names are case-sensitive -- "WGS84", not "wgs84";
  # the lower-case form is rejected/ignored by the PROJ library.
  states_sp <- map2SpatialPolygons(states, IDs=IDs,
  proj4string=CRS("+proj=longlat +datum=WGS84"))
  # Convert pointsDF to a SpatialPoints object in the same CRS
  pointsSP <- SpatialPoints(pointsDF,
  proj4string=CRS("+proj=longlat +datum=WGS84"))
  # Index of the polygon containing each point (NA when outside all states)
  indices <- over(pointsSP, states_sp)
  # Return the state name of the polygon containing each point
  stateNames <- sapply(states_sp@polygons, function(x) x@ID)
  stateNames[indices]
}
uber_driver_city3 = uber_driver_city2
uber_driver_city3 = na.omit(uber_driver_city3) # drop cities that failed to geocode
# Columns 4:5 are assumed to be the coordinate pair added by mutate_geocode
# -- TODO confirm the lon/lat column order.
uber_driver_city3$state = latlong2state(uber_driver_city3[4:5])
uber_driver_city3 = na.omit(uber_driver_city3) # drop points outside the lower 48
states = read.csv("statelatlong.csv") # lookup of state names and two-letter codes
states = states[, c(-2,-3)]
states$City = tolower(states$City) # lower-case to match latlong2state output
# Attach the two-letter state code so plotly can draw the choropleth
uber_driver_city4 = left_join(uber_driver_city3, states, by = c("state" ="City"))
```
Which Cities are the best for Uber Drivers
```{r}
# Best cities for Uber drivers: reverse-geocode the coordinates back to a
# canonical city name, average sentiment per city, and plot the top 20.
uber_driver_city10 = uber_driver_city2
uber_driver_city10$coordinates = paste(uber_driver_city10$lat, uber_driver_city10$lon, sep = ",") # "lat,lon" strings for reverse geocoding
# Reverse geocode; output = "more" returns a data frame of address components
uber_driver_city10$cityname = geocode(uber_driver_city10$coordinates, output = "more")
uber_driver_city10 = na.omit(uber_driver_city10)
# Hand-picked rows that break the regex extraction below -- these indices are
# data-specific and will be wrong if the data is re-scraped. TODO: make robust.
uber_driver_city10=uber_driver_city10[-c(18,156,221,287,289,405,507,732,793,995,1016,1083),]
#The following code pulls out the city name only because the data was very messy
# Matches ",<word>," or ",<word> <word>," inside the address string
pat="(,.\\w+,)|(,.\\w+.\\w+,)"
# NOTE(review): cityname[[5]] takes the 5th column of the geocode() result --
# presumably the full address field; verify against the actual geocode output.
uber_driver_city10$realcityname = gsub("(,\\s)|,","",regmatches(m<-strsplit(uber_driver_city10$cityname[[5]],"\\|"),regexpr(pat,m)))
uber_driver_city11 = uber_driver_city10 %>%
group_by(realcityname) %>%
summarise(mean(sentiment))
names(uber_driver_city11) = c("city", "sentiment")
uber_driver_city11 = uber_driver_city11 %>%
arrange(desc(sentiment))
# Order the factor levels by sentiment so the bars plot in rank order
uber_driver_city11$city <- factor(uber_driver_city11$city, levels = uber_driver_city11$city[order(uber_driver_city11$sentiment)])
best_cities2 = ggplot(uber_driver_city11[1:20,], aes(x=city, y=sentiment)) +
geom_bar(stat="identity", width=.5, fill="black") +
labs(title="Average Sentiment by City") +
theme(axis.text.x = element_text(angle=65, vjust=0.6))
best_cities2
```
Old Version of best cities code
```{r}
# Older top-20 cities bar chart, built directly from the raw city strings
# (no geocoding / canonicalization).
uber_driver_city20 = uber_driver_city[1:20, ]
uber_driver_city20 = uber_driver_city20 %>% arrange(desc(sentiment))
# Order the factor levels by sentiment so the bars plot in rank order
uber_driver_city20$city <- factor(uber_driver_city20$city,
                                  levels = uber_driver_city20$city[order(uber_driver_city20$sentiment)])
best_cities = ggplot(uber_driver_city20, aes(x = city, y = sentiment)) +
  geom_bar(stat = "identity", width = .5, fill = "black") +
  labs(title = "Average Sentiment by City") +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
best_cities
```
Code for building the interactive map
```{r}
# Average driver sentiment per state and draw an interactive US choropleth.
uber_driver_city5 = uber_driver_city4 %>%
  group_by(State) %>%
  summarise(Sentiment = mean(sentiment))
# White state borders (kept for reuse by later map chunks)
l <- list(color = toRGB("white"), width = 2)
# Map projection/options for the lower-48 view
map <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('lightblue'))
uber_state_map <- plot_geo(uber_driver_city5, locationmode = 'USA-states') %>%
  add_trace(
    z = ~Sentiment, locations = ~State,
    color = ~Sentiment, colors = 'Oranges'
  ) %>%
  colorbar(title = "Sentiment Level") %>%
  layout(
    title = 'Historical Uber Driver Sentiment by State
(Hover for breakdown)',
    geo = map
  )
uber_state_map
```
Exporting the widget to be embedded in the website
```{r}
# Publish the interactive map to the author's Plotly account so it can be
# embedded in the website.
# SECURITY WARNING(review): a Plotly API key is hardcoded below and committed
# to source control -- it must be considered compromised. Revoke it and load
# credentials from the environment (e.g. ~/.Renviron) instead of the script.
Sys.setenv("plotly_username"="mikeberg")
Sys.setenv("plotly_api_key"="WBhxjuhZeFMD1Y9M9qbl")
chart_link = api_create(uber_state_map, filename="choropleth")
#chart_link
```
Text Analysis
First Cleaning the Text
```{r}
# Build one cleaned corpus over ALL driver reviews for the text analyses below.
uber_driver_text = uber_drivers
reviews = VCorpus(VectorSource(uber_driver_text$review))
# BUG FIX: lower-case BEFORE removing stopwords -- stopwords("english") is all
# lower-case, so the original order (removeWords first) left capitalized
# stopwords ("The", "I", ...) in the corpus.
reviews = reviews %>%
  tm_map(content_transformer(tolower)) %>%      # lower-case first
  tm_map(removeWords, stopwords("english")) %>% # then drop stopwords
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace)
dtm_all = DocumentTermMatrix(reviews)
# Drop terms absent from more than 99% of documents
dtms_all = removeSparseTerms(dtm_all, .99)
```
Logistic Regression for Words Associated Positively
```{r}
# Feature selection: correlate every term with the positive-review indicator
# (4 or 5 stars) and keep the 20 most positively correlated words.
dtm_matrix_selection = as.matrix(dtms_all)
uber_driver_text$positive = uber_driver_text$stars == 4 | uber_driver_text$stars == 5
corr_positive = cor(uber_driver_text$positive, dtm_matrix_selection)
top20_positive = order(corr_positive, decreasing = TRUE)[1:20] # indices of the strongest correlations
top20_positive_words = colnames(corr_positive)[top20_positive]
top20_positive_words
```
```{r}
# Logistic regression: which of the top-20 terms predict a positive review.
positive_design = as.data.frame(cbind(positive = uber_driver_text$positive,
                                      dtm_matrix_selection[, top20_positive_words]))
pedictive_positive = glm(positive ~ ., data = positive_design, family = binomial)
#summary(pedictive_positive)
```
```{r}
# Word cloud of the terms with the largest positive logistic coefficients;
# coefficient magnitude drives the font size.
coef_positive = coef(pedictive_positive)[-1] # drop the intercept
positive.terms = coef_positive[coef_positive > 0] # keep positively weighted terms
# Top 19 by magnitude (vs 20 on the negative side -- possibly intentional; verify)
top.positive = sort(positive.terms, decreasing = TRUE)[1:19]
top.positive.frame = rownames_to_column(as.data.frame(top.positive))
names(top.positive.frame) = c("word", "magnitude")
wordcloud(words = top.positive.frame$word, freq = top.positive.frame$magnitude,
          max.words = 200, random.order = FALSE, rot.per = .2,
          colors = brewer.pal(8, "Dark2"), scale = c(4, .5))
```
Feature Selection for words associated with negative Reviews
```{r}
# Feature selection for negative reviews: correlate every term with the
# negative indicator (1 or 2 stars) and keep the 20 most correlated words.
uber_driver_text$negative = uber_driver_text$stars == 1 | uber_driver_text$stars == 2
corr_negative = cor(uber_driver_text$negative, dtm_matrix_selection)
top20_negative = order(corr_negative, decreasing = TRUE)[1:20] # indices of the strongest correlations
top20_negative_words = colnames(corr_negative)[top20_negative]
top20_negative_words
```
```{r}
# Logistic regression: which of the top-20 terms predict a negative review.
negative_design = as.data.frame(cbind(negative = uber_driver_text$negative,
                                      dtm_matrix_selection[, top20_negative_words]))
pedictive_negative = glm(negative ~ ., data = negative_design, family = binomial)
#summary(pedictive_negative)
```
```{r}
# Word cloud of the terms with the largest negative logistic coefficients;
# coefficient magnitude drives the font size.
coef_negative = coef(pedictive_negative)[-1] # drop the intercept
negative.terms = coef_negative[coef_negative > 0] # keep positively weighted terms
top.negative = sort(negative.terms, decreasing = TRUE)[1:20] # top 20 by magnitude
top.negative.frame = rownames_to_column(as.data.frame(top.negative))
names(top.negative.frame) = c("word", "magnitude")
wordcloud(words = top.negative.frame$word, freq = top.negative.frame$magnitude,
          max.words = 200, random.order = FALSE, rot.per = .2,
          colors = brewer.pal(8, "Dark2"), scale = c(6, .5))
```
Building an LDA model for Uber Reviews
```{r}
# LDA over all driver reviews (10 topics, Gibbs sampling, fixed seed).
dtm_matrix_lda = as.matrix(dtms_all)
# Drop documents with no remaining terms -- LDA cannot fit empty documents.
# (Renamed from `terms`, which shadowed topicmodels::terms.)
nonempty = rowSums(dtm_matrix_lda) != 0
dtm_matrix_lda = dtm_matrix_lda[nonempty, ]
ldaOut <- LDA(dtm_matrix_lda, 10, method = "Gibbs", control = list(seed = 1234))
terms(ldaOut, 10) # top 10 terms per topic
```
```{r}
# LDA on positive reviews only (star rating 4 or 5); 9 topics.
dtm_matrix_lda_positive = as.matrix(dtms_all)
keep_pos = which(uber_driver_text$positive == TRUE) # rows flagged positive earlier
dtm_matrix_lda_positive = dtm_matrix_lda_positive[keep_pos, ]
nonempty_pos = rowSums(dtm_matrix_lda_positive) != 0 # drop empty documents
dtm_matrix_lda_positive = dtm_matrix_lda_positive[nonempty_pos, ]
lda_positive <- LDA(dtm_matrix_lda_positive, 9, method = "Gibbs", control = list(seed = 1234))
terms(lda_positive, 10) # top 10 terms per topic
```
```{r}
# LDA on negative reviews only; 9 topics.
dtm_matrix_lda_negative = as.matrix(dtms_all)
# Redefines the flag as stars <= 2 (any rating of 2 or below)
uber_driver_text$negative = uber_driver_text$stars <= 2
keep_neg = which(uber_driver_text$negative == TRUE)
dtm_matrix_lda_negative = dtm_matrix_lda_negative[keep_neg, ]
nonempty_neg = rowSums(dtm_matrix_lda_negative) != 0 # drop empty documents
dtm_matrix_lda_negative = dtm_matrix_lda_negative[nonempty_neg, ]
lda_negative <- LDA(dtm_matrix_lda_negative, 9, method = "Gibbs", control = list(seed = 1234))
terms(lda_negative, 10) # top 10 terms per topic
```
```{r}
# Per-topic word probabilities (beta) for the positive-review LDA, plotted as
# faceted bar charts of the top 10 words per topic.
positive_words <- tidy(lda_positive, matrix = "beta")
positive_top_terms <- positive_words %>%     # (local typo "postive_" fixed)
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
positive_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()
```
```{r}
# Per-topic word probabilities (beta) for the negative-review LDA, plotted as
# faceted bar charts of the top 10 words per topic.
negative_words <- tidy(lda_negative, matrix = "beta")
negative_top_terms <- negative_words %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
negative_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()
```
```{r}
# Count the reviews on the Lyft driver review page (shown in the first <b>).
number_lyft = read_html("https://www.indeed.com/cmp/Lyft/reviews?fjobtitle=Driver&start=0") %>%
  html_nodes("b") %>%
  html_text()
number_lyft = gsub(pattern = ",", replacement = "", number_lyft) # strip the thousands separator
number_lyft = as.numeric(number_lyft) # already character after gsub; as.character() wrapper dropped
```
```{r}
# Scrape the title, review text, date, star rating, and city for every Lyft
# driver review page (20 reviews per page) and accumulate them in lyft_drivers.
page = seq(from = 0, to = number_lyft, by = 20) # page offsets to request
lyft_drivers = NULL # running-total data frame
for (start in page) {
  na = rep(NA, 20) # filler when a page yields fewer than 20 of a field
  # Parse the page once and reuse the document for every selector; the
  # original re-downloaded the same URL five times per iteration.
  doc = read_html(paste("https://www.indeed.com/cmp/Lyft/reviews?fjobtitle=Driver&start=", start, sep = ''))
  title = doc %>%
    html_nodes("div[itemprop='review']:nth-of-type(n+2) .cmp-review-title span:nth-of-type(1)") %>%
    html_text()
  date = doc %>%
    html_nodes("div[itemprop='review']:nth-of-type(n+2) span.cmp-review-date-created") %>%
    html_text()
  review = doc %>%
    html_nodes("div[itemprop='review']:nth-of-type(n+2) span[itemprop='reviewBody']") %>%
    html_text()
  stars = doc %>%
    html_nodes("div[itemprop='review']:nth-of-type(n+2) div.cmp-ratingNumber") %>%
    html_text()
  stars = as.numeric(stars) # the original gsub("", "", stars) was a no-op and was dropped
  city = doc %>%
    html_nodes("div[itemprop='review']:nth-of-type(n+2) span.cmp-reviewer-job-location") %>%
    html_text()
  # Pad short fields with NA so data.frame() gets equal-length columns.
  # BUG FIXES: (1) the original tested length(x == 20), which is TRUE for any
  # non-empty vector, so the padding never triggered; (2) the city branch
  # assigned `title = na` instead of `city = na` (copy-paste error).
  if (length(title) != 20) title = na
  if (length(date) != 20) date = na
  if (length(review) != 20) review = na
  if (length(stars) != 20) stars = na
  if (length(city) != 20) city = na
  temp = data.frame(title, review, date, stars, city)
  lyft_drivers = rbind(lyft_drivers, temp)
}
lyft_drivers = na.omit(lyft_drivers) # drop the NA-padded rows
lyft_drivers$position = "driver"
lyft_drivers$date = str_replace_all(lyft_drivers$date, ",", "") # "January 5, 2019" -> "January 5 2019"
lyft_drivers$date = as.Date(lyft_drivers$date, format = "%B %d %Y") # convert to Date
```
```{r}
# Lyft driver sentiment: score each review, average per day, then per month,
# and plot the monthly means over time.
lyft_drivers$sentiment = as.numeric(get_sentiment(as.character(lyft_drivers$review)))
lyft_drivers2 = lyft_drivers %>%
  group_by(date) %>%
  summarise(sentiment = mean(sentiment)) # daily mean sentiment
lyft_drivers_monthly = lyft_drivers2 %>%
  group_by(month = floor_date(date, "month")) %>%
  summarise(sentiment = mean(sentiment)) # monthly mean of the daily means
lyft_drivers_monthly %>%
  ggplot(aes(x = month, y = sentiment)) +
  geom_col(color = "pink") +
  ylab("Sentiment") +
  ggtitle("Lyft Driver Sentiment") +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_smooth(method = "auto") +
  xlab("Date")
```
```{r}
# Uber vs Lyft monthly driver sentiment on one semi-transparent area chart.
uber_drivers_monthly$type = NULL # drop the worker-type column from the earlier plot
uber_drivers_monthly$company = "uber"
lyft_drivers_monthly$company = "lyft"
lyft_v_uber = rbind(uber_drivers_monthly, lyft_drivers_monthly)
ggplot(lyft_v_uber, aes(x = month, y = sentiment)) +
  geom_area(aes(color = company, fill = company),
            alpha = 0.3, position = position_dodge(0.8)) +
  xlab("Date") +
  ylab("Sentiment") +
  ggtitle("Lyft versus Uber", subtitle = "Driver Sentiment over Time") +
  scale_x_date(limits = as.Date(c("2014-05-01", "2019-05-05")))
```
Lyft Best Cities Old
```{r}
# Mean Lyft driver sentiment per city, sorted best-first. city2 is a
# comma-free copy of the city string so the Google geocoder can read it.
lyft_driver_city = lyft_drivers %>%
  group_by(city) %>%
  summarise(sentiment = mean(sentiment)) %>%
  arrange(desc(sentiment))
lyft_driver_city$city2 = gsub(pattern = ",", replacement = "", lyft_driver_city$city)
```
```{r}
# Geocode every city via the Google Maps API (network call; slow, and requires
# a configured ggmap API key). Adds coordinate columns to the data frame --
# presumably lon/lat in that order, as mutate_geocode usually appends; verify.
lyft_driver_city <- lyft_driver_city %>%
  mutate_geocode(city2) #Adding a lat/lon column for each city
```
```{r}
lyft_driver_city2 = lyft_driver_city # duplicate: Google geocoding is slow, keep a copy
# Map (lon, lat) points back to lower-48 state names (plus DC; HI/AK excluded).
latlong2state <- function(pointsDF) {
  # Prepare a SpatialPolygons object with one SpatialPolygon per state,
  # built from the maps package outlines.
  states <- map('state', fill=TRUE, col="transparent", plot=FALSE)
  IDs <- sapply(strsplit(states$names, ":"), function(x) x[1])
  # BUG FIX: proj4 datum names are case-sensitive -- "WGS84", not "wgs84";
  # the lower-case form is rejected/ignored by the PROJ library.
  states_sp <- map2SpatialPolygons(states, IDs=IDs,
  proj4string=CRS("+proj=longlat +datum=WGS84"))
  # Convert pointsDF to a SpatialPoints object in the same CRS
  pointsSP <- SpatialPoints(pointsDF,
  proj4string=CRS("+proj=longlat +datum=WGS84"))
  # Index of the polygon containing each point (NA when outside all states)
  indices <- over(pointsSP, states_sp)
  # Return the state name of the polygon containing each point
  stateNames <- sapply(states_sp@polygons, function(x) x@ID)
  stateNames[indices]
}
lyft_driver_city3 = lyft_driver_city2
lyft_driver_city3 = na.omit(lyft_driver_city3) # drop cities that failed to geocode
# Columns 4:5 are assumed to be the coordinate pair added by mutate_geocode
# -- TODO confirm the lon/lat column order.
lyft_driver_city3$state = latlong2state(lyft_driver_city3[4:5])
lyft_driver_city3 = na.omit(lyft_driver_city3) # drop points outside the lower 48
states = read.csv("statelatlong.csv") # lookup of state names and two-letter codes
states = states[, c(-2,-3)]
states$City = tolower(states$City) # lower-case to match latlong2state output
# Attach the two-letter state code so plotly can draw the choropleth
lyft_driver_city4 = left_join(lyft_driver_city3, states, by = c("state" ="City"))
```
Lyft Driver Best Cities
```{r}
# Best cities for Lyft drivers: reverse-geocode the coordinates back to a
# canonical city name, average sentiment per city, and plot the top 20.
lyft_driver_city10 = lyft_driver_city2
lyft_driver_city10$coordinates = paste(lyft_driver_city10$lat, lyft_driver_city10$lon, sep = ",") # "lat,lon" strings for reverse geocoding
# Reverse geocode; output = "more" returns a data frame of address components
lyft_driver_city10$cityname = geocode(lyft_driver_city10$coordinates, output = "more")
lyft_driver_city10 = na.omit(lyft_driver_city10)
lyft_driver_city10
# Hand-picked rows that break the regex extraction below -- data-specific,
# these indices will be wrong if the data is re-scraped. TODO: make robust.
lyft_driver_city10=lyft_driver_city10[-c(20, 144),]
#The following code pulls out the city name only because the data was very messy
# Matches ",<word>," or ",<word> <word>," inside the address string
pat="(,.\\w+,)|(,.\\w+.\\w+,)"
# NOTE(review): cityname[[5]] takes the 5th column of the geocode() result --
# presumably the full address field; verify against the actual geocode output.
# The exact duplicate of this line in the original (it recomputed the same
# value) has been removed.
lyft_driver_city10$realcityname = gsub("(,\\s)|,","",regmatches(m<-strsplit(lyft_driver_city10$cityname[[5]],"\\|"),regexpr(pat,m)))
lyft_driver_city11 = lyft_driver_city10 %>%
group_by(realcityname) %>%
summarise(mean(sentiment))
names(lyft_driver_city11) = c("city", "sentiment")
lyft_driver_city11 = lyft_driver_city11 %>%
arrange(desc(sentiment))
# Order the factor levels by sentiment so the bars plot in rank order
lyft_driver_city11$city <- factor(lyft_driver_city11$city, levels = lyft_driver_city11$city[order(lyft_driver_city11$sentiment)])
lyft_best_cities2 = ggplot(lyft_driver_city11[1:20,], aes(x=city, y=sentiment)) +
geom_bar(stat="identity", width=.5, fill="hotpink") +
labs(title="Lyft Average Sentiment by City") +
theme(axis.text.x = element_text(angle=65, vjust=0.6))
lyft_best_cities2
```
```{r}
lyft_driver_city5 = lyft_driver_city4 %>% # Creating a data frame grouping by state and finding the mean sentiment for drivers
group_by(State) %>%
summarise(mean(sentiment))
names(lyft_driver_city5) = c("State", "Sentiment")
# give state boundaries a white border
l <- list(color = toRGB("white"), width = 2)
# specify some map projection/options
lyft_map <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('lightblue'))
lyft_state_map <- plot_geo(lyft_driver_city5, locationmode = 'USA-states') %>%
add_trace(
z = ~Sentiment, locations = ~State,
color = ~Sentiment, colors = 'Reds'
) %>%
colorbar(title = "Sentiment Level") %>%
layout(
title = 'Historical lyft Driver Sentiment by State
(Hover for breakdown)',
# BUG FIX: layout previously used `geo = map`, which resolves to the
# maps::map *function*, not the projection options defined above.
geo = lyft_map
)
lyft_state_map
#chart_link_lyft = api_create(lyft_state_map, filename="lyft_choropleth")
#chart_link_lyft #Exporting the Lyft state map to be embedded
```
Performing a two-sample t-test to see whether sentiment amongst Uber drivers differed from that amongst Lyft drivers
```{r}
t.test(uber_drivers$sentiment, lyft_drivers$sentiment, paired = FALSE, alternative = "two.sided")
```
Performing a t-test on Uber driver sentiment before and after the CEO switch
```{r}
t.test(uber_drivers_before$sentiment, uber_drivers_after$sentiment, paired = FALSE, alternative = "two.sided")
```
Performing a t-test on Uber **employee** sentiment before and after the CEO switch
```{r}
t.test(uber_employees_before$sentiment, uber_employees_after$sentiment, paired = FALSE, alternative = "two.sided")
```
```{r}
t.test(uber_drivers$sentiment, uber_employees$sentiment, paired = FALSE, alternative = "two.sided")
```
Looking to see if there is a correlation between state minimum wage and Uber driver sentiment
```{r}
minimum_wage = read.csv("Minimum Wage Data.csv")
minimum_wage$State = tolower(minimum_wage$State) # Lowercasing names to match the `states` lookup table
minimum_wage_avg = minimum_wage %>% # Mean of the higher state/federal minimum wage per state, 2012-2019
# %in% replaces the original long chain of `Year == ...` comparisons
filter(Year %in% 2012:2019) %>%
group_by(State) %>%
summarise(mean(High.Value))
minimum_wage_avg = left_join(minimum_wage_avg, states, by = c("State" ="City")) # Attaching the two-letter state code
minimum_wage_avg = na.omit(minimum_wage_avg)
collective = left_join(uber_driver_city5, minimum_wage_avg, by = c("State" ="State.y")) # Joining sentiment and wage by state code
names(collective) = c("Code", "Sentiment", "State", "MinWage")
sentiment_wage_correlation = cor(collective$Sentiment, collective$MinWage) #Looking at the correlation between sentiment and minimum wage. A negative correlation is expected, because the higher the minimum wage, the more discontent the driver would be by riding with uber
sentiment_wage_correlation #Looking at the correlation of state minimum wage and sentiment
```
Creating a Scatterplot
```{r}
# Scatter of state mean sentiment against state mean minimum wage, with a
# fitted regression line, confidence band, and Pearson r annotation
ggscatter(collective,
          x = "MinWage", y = "Sentiment",
          xlab = "Minimum Wage", ylab = "Sentiment",
          add = "reg.line", conf.int = TRUE,
          cor.method = "pearson", cor.coef = TRUE)
```
The following block of code looked to see if variables in the census data were able to predict sentiment by state. Unfortunately, I couldn't build the model because the census metrics weren't as relevant as I thought.
```{r}
census = read.csv("acs2017_county_data.csv")
census_state = census %>% #Collapsing by state and computing the average of several potentially important metrics by state
group_by(State) %>%
# BUG FIX: mean(Office) appeared twice. The duplicate column shifted every
# later column index by one, so names(census_state)[18] below renamed
# mean(Unemployment) instead of the joined state-code column.
summarise(median(Income), mean(IncomePerCap), mean(Poverty), mean(Professional), mean(Service), mean(Office), mean(Construction), mean(Production), mean(Drive), mean(Carpool), mean(Transit), mean(Walk), mean(WorkAtHome), mean(MeanCommute), mean(Employed), mean(Unemployment))
census_state$State = tolower(census_state$State) #Making state names lowercase
census_state = left_join(census_state, states, by = c("State" ="City")) #Joining with state code
census_state = na.omit(census_state) #Removing NA values
names(census_state)[18] = c("State_Code") # Column 18 is now the joined state-code column (1 State + 16 metrics + code)
census_uber_state = left_join(uber_driver_city5, census_state, by = c("State" ="State_Code")) #Merging uber driver sentiment and other census metrics
census_uber_state = census_uber_state[,-3] #Removing the actual state names
getwd()
write.csv(census_uber_state, file = "uber.csv",row.names=TRUE)
```
```{r}
# BUG FIX: the join and correlation results were never assigned, so
# `all_states` and `sentiment_correlation` stayed NULL and cor() was
# evaluated on NULL columns.
all_states = inner_join(uber_driver_city5, lyft_driver_city5, by = "State") # States with both Uber and Lyft sentiment
# inner_join suffixes the clashing columns: Sentiment.x = Uber, Sentiment.y = Lyft
sentiment_correlation = cor(all_states$Sentiment.x, all_states$Sentiment.y)
sentiment_correlation
```