final_DA.Rmd

```{r}
# Project root; the relative CSV paths used in later chunks resolve against it
path <- "D:\\Stock-Market-Analysis\\"
# Switch the working directory for the current session ...
setwd(path)
# ... and tell knitr to evaluate the chunks from the same directory
knitr::opts_knit$set(root.dir = path)
```

```{r}
library(dplyr)
library(ggplot2)
library(forecast)
library(plotly)
library(keras)
```

```{r}
# Load the complete price history from disk.
# DO NOT EXECUTE THIS CHUNK IF THE WORKSPACE ENVIRONMENT WAS ALREADY LOADED.
data <- read.csv(file = "complete.csv")
```

```{r}
# Show the first and the last six rows to see which fields are present
head(data, n = 6)
tail(data, n = 6)
```

```{r}
# Turn the yyyymmdd-formatted `date` column into a proper Date column
data <- data %>%
  mutate(date = as.Date(as.character(date), format = "%Y%m%d"))
```

```{r}
# Distinct company names, in order of first appearance in the data
companies <- data$name[!duplicated(data$name)]
```

```{r}
# Drop companies with too little history.
# The number of rows per company is used as the proxy for "years of data":
# a company is kept only when it has MORE than `min_rows` records.
# The counts come from a single pass over the data instead of one full-table
# filter per company, and no variable named `c` is created, so `base::c`
# stays unmasked.
min_rows <- 1000
rows_per_company <- table(data$name)
n_rows <- as.vector(rows_per_company)[
  match(as.character(companies), names(rows_per_company))
]
# Names that never occur (e.g. NA) have no count and are dropped as well
companies <- as.character(companies)[!is.na(n_rows) & n_rows > min_rows]
```

```{r}
# Keep only the companies that are still traded, i.e. whose most recent record
# falls in `current_year`.
# The latest date of every company is computed in one grouped pass rather than
# by re-filtering the whole table once per company.
current_year <- 2019
last_traded <- data %>%
  filter(name %in% companies) %>%
  group_by(name) %>%
  summarise(latest = max(date))
last_year <- as.numeric(format(last_traded$latest, "%Y"))
# which() silently drops companies whose latest date is missing
active <- as.character(last_traded$name[which(last_year == current_year)])
# Subset the existing vector so the original company order is preserved
companies <- companies[companies %in% active]
```

```{r}
# Keep only the companies that have at least one record in EVERY calendar year
# between their first and their last record; companies with a gap year are
# dropped.
#
# A company has no gap exactly when the number of distinct years it appears in
# equals the length of the span from its first to its last year, so both
# numbers are computed per company in one grouped pass.
year_coverage <- data %>%
  filter(name %in% companies) %>%
  mutate(year = as.numeric(format(date, "%Y"))) %>%
  group_by(name) %>%
  summarise(
    years_present = n_distinct(year),
    years_spanned = max(year) - min(year) + 1
  )
has_all_years <- which(
  year_coverage$years_present == year_coverage$years_spanned
)
comp_with_all_data <- as.character(year_coverage$name[has_all_years])
# Subset the existing vector so the original company order is preserved
companies <- companies[companies %in% comp_with_all_data]
```

```{r}
# Restrict the price data to the rows of the companies that are still in
# the `companies` vector
data <- filter(data, name %in% companies)
```

```{r}
# Load the split records and drop every row that contains an NA in any column
splits <- read.csv("splits.csv") %>%
  na.omit()

# Parse the dd-mm-yyyy split dates
splits[["date"]] <- as.Date(
  as.character(splits[["date"]]),
  format = "%d-%m-%Y"
)

# Keep only the splits of the companies retained by the previous steps
splits <- filter(splits, code %in% companies)
```

```{r}
# Load the bonus records and give the six columns short names
bonuses <- read.csv("combinedbonuses.csv")
colnames(bonuses) <- c("name", "code", "ratio", "ann", "date", "ex")

# Parse the three dd-mm-yyyy date columns into Date values
for (date_col in c("ann", "date", "ex")) {
  bonuses[[date_col]] <- as.Date(
    as.character(bonuses[[date_col]]),
    format = "%d-%m-%Y"
  )
}

# Discard rows whose company code is missing or empty
has_code <- !is.na(bonuses$code) & bonuses$code != ""
bonuses <- bonuses[has_code, ]

# Discard rows that still contain an NA in any column
bonuses <- na.omit(bonuses)

# Keep only the bonuses of the companies retained by the previous steps
bonuses <- filter(bonuses, code %in% companies)
```

```{r}
# Compute, for every bonus record, the factor by which a later price has to be
# multiplied to obtain the bonus-adjusted price.
# The ratio is stored as text "a:b" and the multiplier is (a + b) / b.
#
# The whole column is processed at once; unlike a `1:nrow(bonuses)` loop this
# also works when `bonuses` has no rows.
bonuses$ratio <- as.character(bonuses$ratio)
ratio_parts <- strsplit(bonuses$ratio, ":", fixed = TRUE)
# A malformed ratio (missing part or non-numeric text) yields NA
first_part <- vapply(ratio_parts, function(p) as.integer(p[1]), integer(1))
second_part <- vapply(ratio_parts, function(p) as.integer(p[2]), integer(1))
bonuses$mult <- (first_part + second_part) / second_part
```

```{r}
# `ratios` holds one split multiplier per entry of `companies` (same order).
# The multiplier is the product of old_fv / new_fv over all of the company's
# split records; the latest price is multiplied by it to get the adjusted
# price. A company without splits gets 1 (the product of nothing).
split_multiplier <- function(company) {
  values <- splits %>% filter(code == company)
  prod(values$old_fv / values$new_fv)
}
ratios <- vapply(companies, split_multiplier, numeric(1), USE.NAMES = FALSE)
```

```{r}
# `ratios1` holds one bonus multiplier per entry of `companies` (same order).
# The multiplier is the running product of the `mult` values of all bonus
# records of the company, starting from 1; the latest price is multiplied by
# it to get the adjusted price.
ratios1 <- c()
for (company in companies) {
  bonus <- filter(bonuses, code == company)
  # Left fold: ((1 * m1) * m2) * ...; stays 1 when there are no bonuses
  ratios1 <- c(ratios1, Reduce(`*`, bonus$mult, 1))
}
```

```{r}
# Combine the two adjustments: for every company multiply its split multiplier
# by its bonus multiplier. Both vectors are in `companies` order, so a plain
# element-wise product is enough (and it also works when both are empty).
stopifnot(length(ratios) == length(ratios1))
ratios_final <- ratios * ratios1
```

```{r}
# Compound annual growth rate (in percent) of every company.
# The latest close is first multiplied by the company's combined split/bonus
# multiplier (`ratios_final`, same order as `companies`); the number of years
# is the difference between the calendar years of the last and first record.
#
# Exactly one value is produced per company: if several rows share the first
# or last date the first of them is used, and a company whose records do not
# span at least one year change gets NA instead of an infinite rate.
company_cagr <- function(i) {
  company <- companies[[i]]
  d <- data %>% filter(name == company)
  first_idx <- which.min(d$date)
  last_idx <- which.max(d$date)
  n_years <- as.numeric(format(d$date[last_idx], "%Y")) -
    as.numeric(format(d$date[first_idx], "%Y"))
  if (length(n_years) == 0 || is.na(n_years) || n_years <= 0) {
    return(NA_real_)
  }
  growth <- (d$close[last_idx] * ratios_final[i]) / d$close[first_idx]
  (growth^(1 / n_years) - 1) * 100
}
cagr <- vapply(seq_along(companies), company_cagr, numeric(1))
```

```{r}
# Pair every company name with its CAGR
filler <- data.frame(company = companies, cagr = cagr)

# Keep the companies whose CAGR is at least 15%
final_companies <- filter(filler, cagr >= 15)
```

```{r}
# Restrict all three tables to the companies that passed the CAGR filter:
# the price data ...
data <- filter(data, name %in% final_companies$company)
# ... the split records ...
splits <- filter(splits, code %in% final_companies$company)
# ... and the bonus records
bonuses <- filter(bonuses, code %in% final_companies$company)
```

```{r}
# For every company that passed the CAGR filter, summarise its yearly returns:
#   final     - one recency-weighted total of the yearly returns,
#   pos_years - number of years with a return > 0,
#   neg_years - number of years with a return <= 0.
# All three vectors are in the order of `final_companies$company`.

# Percentage return of `company` in calendar year `year`: last close of the
# year against first close of the year, with the last close multiplied by the
# split (old_fv / new_fv) and bonus (mult) factors dated in that same year.
# `d` holds the company's price rows and `d_years` the year of each row.
# Returns NA when the company has no rows in that year.
yearly_return <- function(d, d_years, company, year) {
  in_year <- d[which(d_years == year), ]
  if (nrow(in_year) == 0) {
    return(NA_real_)
  }
  # which.min / which.max pick a single row even if a date is duplicated
  first_close <- in_year$close[which.min(in_year$date)]
  last_close <- in_year$close[which.max(in_year$date)]

  split_rows <- splits %>%
    filter(code == company, as.numeric(format(date, "%Y")) == year)
  bonus_rows <- bonuses %>%
    filter(code == company, as.numeric(format(date, "%Y")) == year)
  # prod() of nothing is 1, so years without corporate actions are unchanged
  ratio <- prod(split_rows$old_fv / split_rows$new_fv) * prod(bonus_rows$mult)

  (last_close * ratio - first_close) / first_close * 100
}

tickers <- as.character(final_companies$company)
final <- numeric(length(tickers))
pos_years <- integer(length(tickers))
neg_years <- integer(length(tickers))

for (k in seq_along(tickers)) {
  company <- tickers[k]
  d <- data %>% filter(name == company)
  d_years <- as.numeric(format(d$date, "%Y"))

  # every calendar year from the company's first record to its last one
  years <- seq(min(d_years), max(d_years))
  values <- vapply(
    years,
    function(year) yearly_return(d, d_years, company, year),
    numeric(1)
  )

  # a missing (NA) return counts neither as positive nor as negative
  pos_years[k] <- sum(values > 0, na.rm = TRUE)
  neg_years[k] <- sum(values <= 0, na.rm = TRUE)

  # The older a year, the less it counts: each return is divided by the number
  # of years from that year up to the latest one (latest year / 1, the year
  # before / 2, ..., earliest year / number of years).
  year_weights <- rev(seq_along(values))
  final[k] <- sum(values / year_weights)
}
```

```{r}
# Attach the weighted return and the positive / non-positive year counts
# to the table of shortlisted companies
final_companies[["w_returns"]] <- final
final_companies[["pos"]] <- pos_years
final_companies[["neg"]] <- neg_years
```

```{r}
# Keep only the companies for which at least 75% of the counted years were
# positive years
final_companies <- filter(final_companies, pos / (pos + neg) >= 0.75)
```

```{r}
# Consolidated score: CAGR and weighted returns contribute 50% each.
# Computed on the whole columns at once, which also works for an empty table
# (a `1:nrow()` loop would not).
score <- final_companies$cagr * 0.5 + final_companies$w_returns * 0.5
final_companies$score <- score
# Ascending order: the best-scoring companies end up in the last rows
final_order <- final_companies[order(final_companies$score), ]
```

```{r}
# Take the 250 best-scoring companies (the last rows of the ascending
# ranking), then remove 'CRMFGETF' from that selection
stocks <- final_order %>%
  tail(250) %>%
  filter(company != 'CRMFGETF')
```

```{r}
# A high score can come from a big return many years ago or from recent
# strength. To see whether a company is in an uptrend right now, compare three
# moving averages of its most recent closes: the company gets a 1 when
#   50-day MA > 100-day MA > 200-day MA
# and a 0 otherwise.

data <- data %>% filter(name %in% stocks$company)

# Mean of the last `n` values of `x`; if fewer than `n` values exist, the mean
# of what is available (instead of dividing a shorter sum by a fixed `n`).
trailing_mean <- function(x, n) {
  recent <- tail(x, n)
  sum(recent) / length(recent)
}

# 1 if the company's moving averages are strictly ordered 50 > 100 > 200,
# else 0. A comparison that cannot be evaluated (NA) counts as 0.
in_uptrend <- function(company) {
  d <- data %>% filter(name == company)
  closes <- d$close[order(d$date)]
  for_50 <- trailing_mean(closes, 50)
  for_100 <- trailing_mean(closes, 100)
  for_200 <- trailing_mean(closes, 200)
  as.numeric(isTRUE(for_50 > for_100 && for_100 > for_200))
}

one_or_zero <- vapply(
  as.character(stocks$company),
  in_uptrend,
  numeric(1),
  USE.NAMES = FALSE
)
```

```{r}
# Store the uptrend indicator and keep only the companies flagged with 1
stocks[["flag"]] <- one_or_zero
stocks <- filter(stocks, flag == 1)
```

```{r}
# Names of the companies that survived every filter
comps <- stocks$company

# Hold-out prices: read and parsed once, not once per company
test_all <- read.csv("test.csv")
test_all$date <- as.Date(as.character(test_all$date), format = "%Y%m%d")

# Number of future days to forecast (the original note speaks of 30 days;
# the code has always used 33)
horizon <- 33

# For every company: fit an ARIMA model on the closes since 2019-01-01,
# forecast `horizon` days ahead and plot the forecast against the actual
# closes (training period followed by the hold-out period).
for (company in comps) {
  marico <- subset(data, name == company, select = c(name, date, close))
  marico$date <- as.Date(as.character(marico$date), format = "%Y-%m-%d")
  # only the data from 2019-01-01 onwards, in chronological order
  marico <- subset(
    marico,
    date > as.Date("2018-12-31"),
    select = c(name, date, close)
  )
  marico <- marico[order(marico$date), ]

  test_data <- subset(
    test_all,
    name == company,
    select = c(name, date, close)
  )
  test_data <- test_data[order(test_data$date), ]

  # inside a loop a ggplot object is only drawn when it is printed explicitly
  print(ggplot(data = marico, aes(date, close)) + geom_line())

  fit <- auto.arima(marico$close) # fit the model to the given data
  # NOTE(review): inside a loop this only produces output if summary() prints
  # by itself - confirm for the installed forecast version
  summary(fit)

  forecasted_values <- forecast(fit, horizon)

  # actual closes: training period followed by the hold-out period
  actual_close <- c(marico$close, test_data$close)

  plot(forecasted_values, main = company, xlab = "Day number",
       ylab = "Close price", col = "blue", lwd = 2)
  # the x positions follow the real length of the series; a fixed 1:220 makes
  # lines() fail whenever a company has a different number of observations
  lines(seq_along(actual_close), actual_close, col = "red")
  # keyword position keeps the legend visible whatever the price range is
  legend("topleft", legend = c("Forecasted", "Actual"),
         col = c("blue", "red"), lty = 1, cex = 0.8)
}

```

```{r}

# For every shortlisted company: train a stateful LSTM on the standardised
# close price and plot its predictions for the most recent part of the series.

comps <- stocks$company

datalags <- 10    # the input for day t is the price of day t - datalags
batch.size <- 50  # sample counts are chosen as multiples of this batch size

for (company in comps) {
  marico <- subset(data, name == company, select = c(name, date, close))
  marico$date <- as.Date(as.character(marico$date), format = "%Y-%m-%d")
  marico <- subset(
    marico,
    date > as.Date("1996-01-01"),
    select = c(name, date, close)
  )
  marico <- marico[order(marico$date), ]

  # Standardise the price with the mean and standard deviation of the whole
  # selected history.
  # NOTE(review): these statistics include the later test period - confirm
  # that this look-ahead is acceptable.
  msd.price <- c(mean(marico$close), sd(marico$close))
  marico$price <- (marico$close - msd.price[1]) / msd.price[2]

  # Choose the train/test sizes from the amount of history available.
  # Requirements: the batch size divides both sample counts, and the test
  # count divides the train count.
  # Each of the two sets needs `datalags` extra leading rows that only serve
  # as lagged input, so n_train + n_test + 2 * datalags rows are required.
  # Every row count now lands in exactly one branch; too-short histories are
  # skipped instead of reusing the sizes of the previous company.
  rows <- nrow(marico)
  lag_rows <- 2 * datalags
  if (rows >= 3000 + lag_rows) {
    n_train <- 2000
    n_test <- 1000
  } else if (rows >= 2000 + lag_rows) {
    n_train <- 1500
    n_test <- 500
  } else if (rows >= 1000 + lag_rows) {
    n_train <- 750
    n_test <- 250
  } else {
    message("Skipping ", company, ": only ", rows, " rows of history")
    next
  }
  # keep only the most recent rows that are actually used
  marico <- tail(marico, n_train + n_test + lag_rows)

  # consecutive, non-overlapping slices; both lie fully inside the data
  train <- marico[seq_len(n_train + datalags), ]
  test <- marico[n_train + datalags + seq_len(n_test + datalags), ]

  # Input for target row t is the price `datalags` rows earlier. The lag is
  # written out with head() so it does not depend on which `lag()` function
  # (dplyr or stats) happens to be in scope.
  # NOTE(review): array() recycles that single lagged series, so all
  # datalags x 2 cells of one sample hold the same value - confirm that this
  # input layout is intended.
  x.train <- array(
    data = head(train$price, -datalags),
    dim = c(nrow(train) - datalags, datalags, 2)
  )
  y.train <- array(
    data = train$price[-(1:datalags)],
    dim = c(nrow(train) - datalags, 1)
  )

  x.test <- array(
    data = head(test$price, -datalags),
    dim = c(nrow(test) - datalags, datalags, 2)
  )
  y.test <- array(
    data = test$price[-(1:datalags)],
    dim = c(nrow(test) - datalags, 1)
  )

  # two stacked stateful LSTM layers with dropout, one linear output unit
  model <- keras_model_sequential()
  model %>%
    layer_lstm(units = 100,
               input_shape = c(datalags, 2),
               batch_size = batch.size,
               return_sequences = TRUE,
               stateful = TRUE) %>%
    layer_dropout(rate = 0.5) %>%
    layer_lstm(units = 50,
               return_sequences = FALSE,
               stateful = TRUE) %>%
    layer_dropout(rate = 0.5) %>%
    layer_dense(units = 1)

  model %>%
    compile(loss = 'mae', optimizer = 'adam')

  # ten passes over the training data, one epoch each, without shuffling;
  # the LSTM states are reset after every pass
  for (i in 1:10) {
    model %>% fit(x = x.train,
                  y = y.train,
                  batch_size = batch.size,
                  epochs = 1,
                  verbose = 0,
                  shuffle = FALSE)
    model %>% reset_states()
  }

  # predictions for the n_test target rows of the test slice
  pred_out <- model %>% predict(x.test, batch_size = batch.size) %>% .[, 1]

  # The targets of the test slice are the last n_test rows of `marico`, so the
  # predictions are padded with NA for all rows before them and line up with
  # the dates they were made for.
  no_prediction <- rep(NA, n_train + lag_rows)
  print(plot_ly(marico, x = ~date, y = ~price, type = "scatter",
                mode = "markers", name = "Observed values") %>%
    add_trace(y = c(no_prediction, pred_out), x = marico$date,
              name = "LSTM prediction", mode = "lines") %>%
    layout(title = company))
}

```


```{r}
# Compare two hypothetical portfolios, each holding one share per company:
#   1. every company shortlisted by the indicator-based scoring (`comps`),
#   2. the hand-picked companies whose ARIMA plots looked good.
# A share is bought at the last close in the training data (`data`) and valued
# at the last close in the hold-out data (test.csv); the profit is printed as
# a percentage of the purchase cost.

test_data <- read.csv("test.csv")

# Close price of `company` on its most recent date in `prices`
# (NA if the company does not occur in `prices`).
last_close <- function(prices, company) {
  rows <- prices %>% filter(name == company)
  if (nrow(rows) == 0) {
    return(NA_real_)
  }
  latest <- rows %>% filter(date == max(rows$date))
  as.numeric(latest$close[1])
}

# Profit in percent of buying one share of every company in `tickers` at its
# last training close and valuing it at its last hold-out close.
basket_profit_pct <- function(tickers) {
  cost <- sum(vapply(tickers, function(company) last_close(data, company),
                     numeric(1)))
  value <- sum(vapply(tickers, function(company) last_close(test_data, company),
                      numeric(1)))
  (value - cost) / cost * 100
}

companies1 <- as.character(comps)
cat("Profit percentage according to scores based on indicators: ")
cat(basket_profit_pct(companies1))
cat('\n')

# companies selected by hand after looking at the ARIMA plots
com <- c('N100', 'PIIND', 'ABBOTINDIA')
cat("Profit percentage according to ARIMA: ")
cat(basket_profit_pct(com))
```