-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsave_data.py
More file actions
46 lines (34 loc) · 1.62 KB
/
save_data.py
File metadata and controls
46 lines (34 loc) · 1.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import pandas as pd
import numpy as np
import os
# Directory that is scanned for the input CSV files.
folder_path = 'data_clean'

# Names of the entries in ``folder_path`` that end with ``.csv``.
csv_files = list(
    filter(lambda name: name.endswith('.csv'), os.listdir(folder_path))
)
def calculate_log_difference(folder_path, column_name):
    """Build a table of row-to-row log differences of one column per CSV file.

    Every file in ``folder_path`` whose name ends with ``.csv`` is read, its
    ``CH_TIMESTAMP`` column is parsed to datetimes and used as the index, and
    -- if the file contains ``column_name`` -- the series
    ``log(x[t]) - log(x[t-1])`` over consecutive rows is collected.  Files
    that lack ``column_name`` are skipped.

    Args:
        folder_path: Directory to scan for CSV files.
        column_name: Name of the column to transform.

    Returns:
        A DataFrame with one column per contributing file, labelled with the
        file name without its extension, aligned on the timestamp index.
        Entries whose current or previous value is not strictly positive
        are NaN, as is the first row of each file.

    Raises:
        ValueError: If no CSV file in ``folder_path`` contains
            ``column_name``.
    """
    log_diff_data = []
    file_names = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # column order of the result reproducible between runs.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.csv'):
            continue

        df = pd.read_csv(os.path.join(folder_path, filename))
        df['CH_TIMESTAMP'] = pd.to_datetime(df['CH_TIMESTAMP'])
        df = df.set_index('CH_TIMESTAMP')

        if column_name not in df.columns:
            continue

        values = df[column_name]
        # The logarithm is undefined for values <= 0.  Masking them to NaN
        # up front avoids -inf/+inf differences (and NumPy runtime
        # warnings); infinities would otherwise turn every correlation
        # involving this file into NaN, whereas NaN rows are simply skipped.
        log_diff = np.log(values.where(values > 0)).diff()

        log_diff_data.append(log_diff)
        file_names.append(os.path.splitext(filename)[0])
        print(f"Log df of {filename} calculated")

    if not log_diff_data:
        # pd.concat() would raise an opaque "No objects to concatenate";
        # keep the exception type but say what was actually missing.
        raise ValueError(
            f"No CSV file in {folder_path!r} contains column {column_name!r}"
        )

    log_diff_df = pd.concat(log_diff_data, axis=1)
    log_diff_df.columns = file_names
    return log_diff_df
def calculate_correlation(log_diff_df, output_path):
    """Write the pairwise column correlations of ``log_diff_df`` to a CSV file.

    The correlation matrix returned by ``DataFrame.corr()`` (pandas default
    method) is flattened to long form: one row per ordered pair of columns,
    including each column paired with itself, under the headers
    ``FILENAME1``, ``FILENAME2`` and ``CORRELATION``.

    Args:
        log_diff_df: DataFrame whose columns are correlated with each other.
        output_path: Destination of the CSV.  When it is a filesystem path,
            missing parent directories are created first.

    Returns:
        None.  The result is written to ``output_path``.
    """
    correlation_matrix = log_diff_df.corr()
    correlation_df = correlation_matrix.unstack().reset_index()
    correlation_df.columns = ['FILENAME1', 'FILENAME2', 'CORRELATION']

    # to_csv() does not create directories, so a missing output folder would
    # abort the run after the correlations were already computed.  Only
    # path-like targets are handled; file-like buffers pass through untouched.
    if isinstance(output_path, (str, os.PathLike)):
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    correlation_df.to_csv(output_path, index=False)
# Columns for which a separate correlation CSV is produced.
column_names = [
    'CH_TRADE_HIGH_PRICE',
    'CH_TRADE_LOW_PRICE',
    'CH_OPENING_PRICE',
    'CH_TOT_TRADED_QTY',
    'CH_CLOSING_PRICE',
    'CH_TOT_TRADED_VAL',
]

# For each column: compute the per-file log differences, then write their
# pairwise correlations to a CSV named after that column.
for colname in column_names:
    log_diff_df = calculate_log_difference(folder_path, colname)
    output_path = 'correlation_data/correlation_log_diff_' + colname + '.csv'
    calculate_correlation(log_diff_df, output_path)