# BUAN 6356 Group Project
# Group Members: Akash Navneeth, Akshat Gadiya, & Rohith Thadela
# Project: Analyzing Attrition at IBM

# Importing the attrition_data.csv file ----
# NOTE(review): the project directory below is a machine-specific absolute
# path. It is only applied when it exists, so the script can also be run from
# any working directory that already contains attrition_data.csv.
getwd()
project_dir <- paste0(
  "C:/Users/akash/OneDrive/Desktop/UTD/Spring 2023 Classes/",
  "(5) BUAN 6356/Group Project"
)
if (dir.exists(project_dir)) {
  setwd(project_dir)
} else {
  message("Project directory not found; reading data from ", getwd())
}
attritiondata <- read.csv(
  "attrition_data.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Converting certain variables to factor type ----
# These columns are used below as discrete axes / facets, so they are stored
# as factors rather than as plain numbers.
rating_cols <- c(
  "Education",
  "EnvironmentSatisfaction",
  "JobSatisfaction",
  "WorkLifeBalance"
)
attritiondata[rating_cols] <- lapply(attritiondata[rating_cols], as.factor)

# Loading the necessary libraries for the analysis ----
library(ggplot2)

# Insight 1: Are employees that travel a lot, compared to other employees,
# more likely to leave the company?
# Creating a histogram of DistanceFromHome, with one panel per attrition
# status, to answer this insight.
g1 <- ggplot(attritiondata, aes(x = DistanceFromHome, fill = Attrition)) +
  geom_histogram(color = "black") +
  labs(
    x = "Distance From Home",
    y = "Number of Employees",
    title = "Does Distance From Home Impact Attrition?"
  ) +
  facet_grid(rows = vars(Attrition))
# print() is explicit so the plot is also drawn when the file is source()d.
print(g1)

# Insight 2: Are less educated people in the company more likely to leave the
# company?
# Creating a bar plot to display the data to answer this insight.
g2 <- ggplot(attritiondata, aes(x = Education, fill = Attrition)) +
  geom_bar(color = "black", stat = "count") +
  labs(
    x = "Level of Education",
    # The y-axis label must be passed as `y =`; an unnamed string is not
    # applied to any axis.
    y = "Number of Employees",
    title = "Does Level of Education Impact Attrition?"
  ) +
  facet_grid(rows = vars(Attrition)) +
  # Labels are matched to the Education factor levels by position.
  scale_x_discrete(
    labels = c("Below College", "College", "Bachelors", "Masters", "Doctorate")
  )
print(g2)

# Insight 3: Is there a specific department that seems to have a higher amount
# of people who decide to leave the company?
# Creating multiple pie charts to display the data to answer this insight.
# Each pie is a single 100%-stacked bar (position = "fill") drawn in polar
# coordinates, with one panel per department.
g3 <- ggplot(attritiondata, aes(x = factor(1), fill = Attrition)) +
  geom_bar(position = "fill") +
  facet_grid(cols = vars(Department)) +
  coord_polar(theta = "y") +
  labs(x = "", y = "Department", title = "Rate of Attrition By Department")
# print() is explicit so the plot is also drawn when the file is source()d.
print(g3)

# Insight 4: Do employees that have worked at a lot of companies prior to IBM
# have a higher chance of leaving the company?
# Creating a side-by-side bar chart to display the answer to this insight.
g4 <- ggplot(attritiondata, aes(x = NumCompaniesWorked, fill = Attrition)) +
  geom_bar(color = "black", stat = "count", position = position_dodge()) +
  # One axis tick per company count from 0 to 9.
  scale_x_continuous(breaks = 0:9) +
  labs(
    x = "Number of Companies Previously Worked",
    y = "Number of Employees",
    title = "Does Number of Companies Previously Worked Influence Attrition?"
  )
print(g4)

# Insight 5: Do employees that have worked for a long time with IBM have a
# higher chance of staying with the company?
# Creating a side-by-side bar chart to display the answer to this insight.
g5 <- ggplot(attritiondata, aes(x = YearsAtCompany, fill = Attrition)) +
  geom_bar(color = "black", stat = "count", position = position_dodge()) +
  labs(
    x = "Tenure at IBM",
    y = "Number of Employees",
    title = "Does Tenure Impact Attrition?"
  )
print(g5)

# Insight 6: Which of the following factors has more of an impact on whether
# an employee might leave the company (and contribute to the attrition rate):
# job satisfaction, environment satisfaction, or work-life balance?

# Creating a faceted pie chart to analyze Level of JOB SATISFACTION.
g6a <- ggplot(attritiondata, aes(x = factor(1), fill = Attrition)) +
  geom_bar(position = "fill") +
  facet_grid(cols = vars(JobSatisfaction)) +
  coord_polar(theta = "y") +
  labs(
    x = "",
    y = "Level of Job Satisfaction",
    title = "Rate of Attrition By Level of Job Satisfaction"
  )
print(g6a)

# Creating a faceted pie chart to analyze Level of WORK-LIFE BALANCE.
# Each pie is a single 100%-stacked bar (position = "fill") drawn in polar
# coordinates, with one panel per rating level.
g6b <- ggplot(attritiondata, aes(x = factor(1), fill = Attrition)) +
  geom_bar(position = "fill") +
  facet_grid(cols = vars(WorkLifeBalance)) +
  coord_polar(theta = "y") +
  labs(
    x = "",
    y = "Level of Work-Life Balance",
    title = "Rate of Attrition By Level of Work-Life Balance"
  )
# print() is explicit so the plot is also drawn when the file is source()d.
print(g6b)

# Creating a faceted pie chart to analyze Level of ENVIRONMENT SATISFACTION.
g6c <- ggplot(attritiondata, aes(x = factor(1), fill = Attrition)) +
  geom_bar(position = "fill") +
  facet_grid(cols = vars(EnvironmentSatisfaction)) +
  coord_polar(theta = "y") +
  labs(
    x = "",
    y = "Level of Environment Satisfaction",
    title = "Rate of Attrition By Level of Environment Satisfaction"
  )
print(g6c)

# Insight 7: Is there a particular trend between age and monthly income
# received at IBM in relation to attrition?
# Creating a scatterplot with a trendline (one loess curve per attrition
# status, because color is mapped to Attrition).
g7 <- ggplot(attritiondata, aes(x = Age, y = MonthlyIncome, color = Attrition)) +
  geom_point(stat = "identity") +
  geom_smooth(method = "loess") +
  labs(
    y = "Monthly Income",
    title = "Is there a Correlation Between Age & Monthly Income with Attrition?"
  )
print(g7)

# Building a Classification Model ----
# Fixed seed so the train/test split is reproducible.
set.seed(123)
n_rows <- nrow(attritiondata)
# Two thirds of the rows are sampled for training; the rest form the test set.
index_train <- sample(seq_len(n_rows), 2 / 3 * n_rows)
attritiondata.train <- attritiondata[index_train, ]
attritiondata.test <- attritiondata[-index_train, ]

# install.packages("rpart")
# install.packages("rpart.plot")
library(rpart)
library(rpart.plot)

# Classification tree predicting Attrition from every other column, using
# Gini splits, 10-fold cross-validation and at least 50 rows per split node.
fit <- rpart(
  Attrition ~ .,
  data = attritiondata.train,
  method = "class",
  control = rpart.control(xval = 10, minsplit = 50),
  parms = list(split = "gini")
)
rpart.plot(fit, type = 1, extra = 1, main = "Classification Tree")

attritiondata.pred <- predict(fit, attritiondata.test, type = "class")
attritiondata.actual <- attritiondata.test$Attrition

# Confusion matrix (rows = predicted, columns = actual). The actual labels are
# put on the same levels as the predictions so the table is always 2 x 2, even
# if one class happens to be absent from the test set.
class_levels <- levels(attritiondata.pred)
stopifnot(length(class_levels) == 2L)
cm <- table(
  attritiondata.pred,
  attritiondata.actual = factor(attritiondata.actual, levels = class_levels)
)

# The second class level is treated as the positive class.
# NOTE(review): presumably this is the "employee left" label - confirm
# against the values of Attrition in the data.
negative_class <- class_levels[1]
positive_class <- class_levels[2]
tp <- cm[positive_class, positive_class]
tn <- cm[negative_class, negative_class]
fp <- cm[positive_class, negative_class]
fn <- cm[negative_class, positive_class]

# Calculation for False Positive Rate (FPR)
attritiondata.test.fpr <- fp / (fp + tn)

# Calculation for False Negative Rate (FNR)
attritiondata.test.fnr <- fn / (fn + tp)

# Calculation for Specificity
attritiondata.test.spec <- tn / (tn + fp)

# Calculation for Sensitivity
attritiondata.test.sens <- tp / (tp + fn)

# Calculation for Accuracy
attritiondata.test.acc <- (tp + tn) / (tp + tn + fp + fn)