Welcome to the Simplilearn Community

Want to join the rest of our members? Sign up right away!

Sign Up

Deepti - Data Science with R - Nov 28 - Jan 17

Apart from statistics, are there any other math concepts we need to brush up on for now? It seems data science is mostly statistics, while machine-learning-specific tasks need multivariable calculus and linear algebra. Any ideas? Also, is there any to-the-point material?
 
hi everyone,
Please suggest! Is this code for the Walmart project OK?




# Walmart weekly-sales regression: split the data, fit a linear model,
# predict on the hold-out set and report the RMSE.
library(caTools) # provides sample.split()

data1 <- read.csv("Walmart_Store_sales.csv")
# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(data1)
}

# Dividing data into training and validation sets.
# Fix the RNG seed so the random split (and every number derived from it)
# is reproducible from run to run.
set.seed(123)

# Sample the input data: 70% for training, 30% for testing.
# Named `split_mask` rather than `sample` so base::sample() is not masked
# and the name cannot collide with a column when subsetting.
split_mask <- sample.split(data1$Weekly_Sales, SplitRatio = 0.70)
table(split_mask) # TRUE = training rows, FALSE = testing rows

train_data <- data1[split_mask, ]
test_data <- data1[!split_mask, ]

# Model building: start with every other column as a predictor.
# NOTE(review): any identifier or date-like columns enter this formula
# as-is; consider converting them to factor/Date first -- confirm against
# the actual column types in the CSV.
model <- lm(Weekly_Sales ~ ., data = train_data)
summary(model)

# Filter the significant variables and re-run the model using only those:
# consider this the final model.
model1 <- lm(Weekly_Sales ~ Fuel_Price + CPI + Unemployment, data = train_data)
summary(model1)

# Prediction on the testing data set
predtest <- predict(model1, newdata = test_data)
head(predtest)

# Keep the predictions in a data frame so they can be attached to test_data.
pred1 <- data.frame(predtest)

# Plot actual (red) versus predicted (blue) values.
# Visual overlap alone does not establish a good fit; judge the model by the
# RMSE below and the R-squared reported by summary(model1).
plot(test_data$Weekly_Sales, col = "red", type = "l")
lines(predtest, col = "blue", type = "l")

# Bind the predictions to the original test rows with cbind().
final_data <- cbind(test_data, pred1)

# Root-mean-square error of the predictions on the test set
rmse <- sqrt(mean((final_data$Weekly_Sales - final_data$predtest)^2))
rmse

write.csv(final_data, "linear_out.csv")
 

dhanya_13

New Member
The R-squared for the Walmart data is very low (18%). I tried simply loading the data as-is and creating the model without converting any columns. I also tried changing the date to Date format and holiday_flag to a factor. In both cases the significant variables are different, but the R-squared is very low for both. Did anyone get an R-squared of 70% or more?
 
hi everyone,
Please suggest! Is this code for the Walmart project OK?




# Walmart weekly-sales regression: split the data, fit a linear model,
# predict on the hold-out set and report the RMSE.
library(caTools) # provides sample.split()

data1 <- read.csv("Walmart_Store_sales.csv")
# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(data1)
}

# Dividing data into training and validation sets.
# Fix the RNG seed so the random split (and every number derived from it)
# is reproducible from run to run.
set.seed(123)

# Sample the input data: 70% for training, 30% for testing.
# Named `split_mask` rather than `sample` so base::sample() is not masked
# and the name cannot collide with a column when subsetting.
split_mask <- sample.split(data1$Weekly_Sales, SplitRatio = 0.70)
table(split_mask) # TRUE = training rows, FALSE = testing rows

train_data <- data1[split_mask, ]
test_data <- data1[!split_mask, ]

# Model building: start with every other column as a predictor.
# NOTE(review): any identifier or date-like columns enter this formula
# as-is; consider converting them to factor/Date first -- confirm against
# the actual column types in the CSV.
model <- lm(Weekly_Sales ~ ., data = train_data)
summary(model)

# Filter the significant variables and re-run the model using only those:
# consider this the final model.
model1 <- lm(Weekly_Sales ~ Fuel_Price + CPI + Unemployment, data = train_data)
summary(model1)

# Prediction on the testing data set
predtest <- predict(model1, newdata = test_data)
head(predtest)

# Keep the predictions in a data frame so they can be attached to test_data.
pred1 <- data.frame(predtest)

# Plot actual (red) versus predicted (blue) values.
# Visual overlap alone does not establish a good fit; judge the model by the
# RMSE below and the R-squared reported by summary(model1).
plot(test_data$Weekly_Sales, col = "red", type = "l")
lines(predtest, col = "blue", type = "l")

# Bind the predictions to the original test rows with cbind().
final_data <- cbind(test_data, pred1)

# Root-mean-square error of the predictions on the test set
rmse <- sqrt(mean((final_data$Weekly_Sales - final_data$predtest)^2))
rmse

write.csv(final_data, "linear_out.csv")
-----------------------------------------------------------------------------------------------------
Mine is almost same process except finding the Max sales...

# Finding which store has maximum sales.
# NOTE(review): this orders individual rows of data1 by Weekly_Sales, so it
# surfaces the largest single weekly figures rather than a per-store total;
# aggregate by store first if the overall best-selling store is wanted.
max_sales <- data1[order(data1$Weekly_Sales, decreasing = TRUE), ]
head(max_sales) # the six rows with the highest Weekly_Sales
# 14th store has max sales.
 
hi everyone,
Please suggest! Is this code for the Walmart project OK?




# Walmart weekly-sales regression: split the data, fit a linear model,
# predict on the hold-out set and report the RMSE.
library(caTools) # provides sample.split()

data1 <- read.csv("Walmart_Store_sales.csv")
# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(data1)
}

# Dividing data into training and validation sets.
# Fix the RNG seed so the random split (and every number derived from it)
# is reproducible from run to run.
set.seed(123)

# Sample the input data: 70% for training, 30% for testing.
# Named `split_mask` rather than `sample` so base::sample() is not masked
# and the name cannot collide with a column when subsetting.
split_mask <- sample.split(data1$Weekly_Sales, SplitRatio = 0.70)
table(split_mask) # TRUE = training rows, FALSE = testing rows

train_data <- data1[split_mask, ]
test_data <- data1[!split_mask, ]

# Model building: start with every other column as a predictor.
# NOTE(review): any identifier or date-like columns enter this formula
# as-is; consider converting them to factor/Date first -- confirm against
# the actual column types in the CSV.
model <- lm(Weekly_Sales ~ ., data = train_data)
summary(model)

# Filter the significant variables and re-run the model using only those:
# consider this the final model.
model1 <- lm(Weekly_Sales ~ Fuel_Price + CPI + Unemployment, data = train_data)
summary(model1)

# Prediction on the testing data set
predtest <- predict(model1, newdata = test_data)
head(predtest)

# Keep the predictions in a data frame so they can be attached to test_data.
pred1 <- data.frame(predtest)

# Plot actual (red) versus predicted (blue) values.
# Visual overlap alone does not establish a good fit; judge the model by the
# RMSE below and the R-squared reported by summary(model1).
plot(test_data$Weekly_Sales, col = "red", type = "l")
lines(predtest, col = "blue", type = "l")

# Bind the predictions to the original test rows with cbind().
final_data <- cbind(test_data, pred1)

# Root-mean-square error of the predictions on the test set
rmse <- sqrt(mean((final_data$Weekly_Sales - final_data$predtest)^2))
rmse

write.csv(final_data, "linear_out.csv")




Guys, please help. I am not from a programming background. Is the above code OK?
 
Last edited:
-----------------------------------------------------------------------------------------------------
Mine is almost same process except finding the Max sales...

# Finding which store has maximum sales.
# NOTE(review): this orders individual rows of data1 by Weekly_Sales, so it
# surfaces the largest single weekly figures rather than a per-store total;
# aggregate by store first if the overall best-selling store is wanted.
max_sales <- data1[order(data1$Weekly_Sales, decreasing = TRUE), ]
head(max_sales) # the six rows with the highest Weekly_Sales
# 14th store has max sales.
yes...finally i submitted my project.
 
Top