binaryNLP/generate_binary_features.py at master · skillachie/binaryNLP · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#! /usr/bin/env python

from categorize_articles import CategorizeArticles
from binary_features import CategorySeries
from article_events import Events
from pprint import pprint
import datetime
import os
import sys
import argparse

'''
Script used to generate various NLP binary features
'''

def text_classif_features(directory,match,start,end,outpath,cpus,aggr_freq,add_noise):

    # - Get predictions
    cat_articles = CategorizeArticles()
    results = cat_articles.run(directory,cpus,match,start,end)

    # - Get results from proxy object
    predictions = []
    for result in results:
        predictions.append(result)

    categories_series = CategorySeries(start,end,aggr_freq,add_noise)
    series_result = categories_series.get_category_timeseries(predictions)

    series_result.to_csv('category_features.csv')

    # Get  events
    events = Events(predictions)
    event_results = events.run()

    event_counts = []
    for result in event_results:
      event_counts.append(result)

    # Event counts
    event_series = CategorySeries(start,end,aggr_freq,add_noise)
    event_count_results = event_series.get_event_timeseries(event_counts)
    event_count_results.to_csv('event_features_count.csv')

    # Continuos  and Stop Events
    pprint(event_count_results)
    event_countinuous_results = event_series.get_continuous_events_timeseries(event_counts,5,'daily')
    pprint(event_countinuous_results)

    tmp_d = event_count_results.join(event_countinuous_results, how='outer')

    mpd = series_result.join(tmp_d, how='outer')
    mpd.to_csv(outpath)

def split_date(date_str):
    date_dt = datetime.datetime.strptime(date_str, '%Y-%m-%d')
    return date_dt

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--match', help='directory pattern match',nargs='?',const='\d+-\d+-\d+-\d+-\d+-\d+', type=str,default='\d+-\d+-\d+-\d+-\d+-\d+')
    parser.add_argument('--start_date', help='start date for series',type=str)
    parser.add_argument('--end_date', help='end date to be used',type=str)
    parser.add_argument('--n_cpus', help='# of cores to  be used',nargs='?',const=0,type=int,default=0)
    parser.add_argument('--binary_features_outpath', help='fully qualified file name to be used',nargs='?',const='nlp_category_binary_features.csv',type=str,default='nlp_category_binary_features.csv')
    parser.add_argument('--aggr_freq', help='specify if you want hourly/daily aggregatoins',nargs='?',const='hourly',type=str,default='hourly')
    parser.add_argument('--add_noise', help='add normal distributed noise to data to fix cases when there are no predicted articles for that topic',nargs='?',const=False,type=bool,default=False)
    parser.add_argument('--data_location', help='the location of the news articles folder')
    args = parser.parse_args()

    directory_location = args.data_location
    if not directory_location:
        directory_location= os.path.join(os.path.dirname(__file__), 'data')

    start_date = args.start_date
    if not start_date:
       st = '2009-01-01'
       start_date = split_date(st)
    else:
        start_date = split_date(start_date)

    end_date = args.end_date
    if not end_date:
        ed='2014-12-31'
        end_date = split_date(ed)
    else:
        end_date = split_date(end_date)


    #for arg in args:
    #    if os.path.isdir(arg):
    #        directory_location = arg
    #    elif arg.find('*') > -1:
    #        match = arg
    #    elif arg.find('.csv') > -1:
    #        outpath = arg
    #    elif arg[0].isdigit() and arg.find('/') > -1:
    #        parts = arg.split('_')
    #        datex = parts[0].split('/')
    #        if len(parts) > 1:
    #            timex = parts[1].split(':')
    #        elif start:
    #            timex = [ 13, 59, 99 ]
    #        else:
    #            timex = [ 0, 0, 0 ]
    #            dt=datetime.datetime(int(datex[0]),int(datex[1]),int(datex[2]),
    #                                        int(timex[0]),int(timex[1]),int(timex[2]))
    #        if start:
    #            end = dt
    #        else:
    #            start = dt
    #    elif arg[0].isdigit():
    #        cpus = int(arg)

    #    if not start:
    #            start=datetime.datetime(2009,1,1),
    #    if not end:
    #            end=datetime.datetime(2014,12,31),
    abs_dir_location = os.path.abspath(directory_location)
    text_classif_features(directory_location,args.match,start_date,end_date,args.binary_features_outpath,args.n_cpus,args.aggr_freq,args.add_noise)

if __name__ == '__main__':
        #main(sys.argv[1:])
        main()