/
merge_data.py
92 lines (59 loc) · 2.21 KB
/
merge_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import os
import sys
import pandas as pd
# Build the list of NIFTY50 stock symbols from one sample month's folder:
# each "<SYMBOL>.txt" file inside it corresponds to one company.
demo_folder_for_stock_list = r"C:\Users\as250199\nifty50\oneminutedata\2017\APR\NIFTY50_APR2017\NIFTY50_APR2017"
company = [
    entry[:-4]  # drop the ".txt" suffix to leave the bare symbol
    for entry in os.listdir(demo_folder_for_stock_list)
    if entry.endswith(".txt")
]
# Walk every month folder under `root_folder`, read each per-stock ".txt"
# file (raw one-minute bars, no header row) into a DataFrame, and flush the
# accumulated rows to numbered "part" CSVs roughly every 100k rows so memory
# stays bounded.  The leftover rows are flushed after the walk finishes.
root_folder = r"C:\Users\as250199\nifty50\oneminutedata\2018"
# Column layout of the raw bar files.  BUG FIX: this was commented out in the
# original, leaving `columns` undefined and crashing with NameError below.
columns = ["STOCK", "DATE", "TIME", "OPEN", "HIGH", "LOW", "CLOSE", "VOLUME"]
# Derive the year from the folder being processed instead of hard-coding
# "2017" in output names while walking 2018 data.
year = os.path.basename(root_folder)

count = 0    # rows accumulated since the last part flush
part = 0     # 1-based index of the next part CSV to write
frames = []  # per-file DataFrames waiting to be concatenated and flushed

for root, dirs, files in os.walk(root_folder):
    # Only folders named like "NIFTY50_<MON><YEAR>" hold the data files.
    # (The original tested root[49:] -- a fragile hard-coded path offset.)
    if not os.path.basename(root).startswith("NIFTY50"):
        continue
    print(os.path.basename(root))
    for _file in files:
        if not _file.endswith(".txt"):
            continue
        # `root` from os.walk is already an absolute path rooted at
        # root_folder, so no extra prefix is needed.
        full_path = os.path.join(root, _file)
        print(full_path)
        df = pd.read_csv(full_path, names=columns, header=None)
        frames.append(df)
        count += len(df)
        if count > 100000:
            # Flush the accumulated chunk to disk and start a fresh one.
            part += 1
            pd.concat(frames, ignore_index=True).to_csv(
                "full_dataset_{}_part_{}.csv".format(year, part), index=False)
            frames = []
            count = 0

# BUG FIX: the original discarded any rows remaining after the walk (the
# final partial chunk was never saved).  Flush it here.
if frames:
    part += 1
    pd.concat(frames, ignore_index=True).to_csv(
        "full_dataset_{}_part_{}.csv".format(year, part), index=False)
# Merge all previously written "part" CSVs (these already carry a header
# row) into one combined dataset file.
part_frames = []
# Sort for a deterministic merge order (os.listdir order is arbitrary).
for fname in sorted(os.listdir(".")):
    # BUG FIX: the original used `fname.find("part") > 0`, which silently
    # skips a file whose name *starts* with "part" (find returns 0).  Also
    # require the ".csv" suffix so unrelated files that merely contain the
    # substring "part" are not fed to read_csv.
    if "part" in fname and fname.endswith(".csv"):
        print(fname)
        part_frames.append(pd.read_csv(fname))

# Single concat of all chunks instead of growing a DataFrame incrementally.
if part_frames:
    full_dataset = pd.concat(part_frames, ignore_index=True)
else:
    full_dataset = pd.DataFrame()
# NOTE(review): output name says 2017 but root_folder above points at 2018 --
# confirm which year this run is actually for.
full_dataset.to_csv("all_dataset_2017.csv", index=False)