-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathGenerateInput.py
More file actions
191 lines (159 loc) · 7.93 KB
/
GenerateInput.py
File metadata and controls
191 lines (159 loc) · 7.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
'''
The source is inspired by the BEELINE
Pratapa, A., Jalihal, A.P., Law, J.N., Bharadwaj, A., Murali, T. M. (2020)
"Benchmarking algorithms for gene regulatory network inference from single-cell transcriptomic data." Nature Methods, 17, 147–154.
'''
from __future__ import print_function
import argparse
import pandas as pd
from itertools import product, permutations, combinations, combinations_with_replacement
import numpy as np
import sys
import os
def get_parser() -> argparse.ArgumentParser:
    '''
    Build the command-line parser for this script.

    :return: an argparse ArgumentParser object for parsing command
        line parameters
    '''
    parser = argparse.ArgumentParser(
        description='Generate experimental scRNA-seq inputs for BEELINE.',
        epilog='Example usage to generate dataset with all TFs and 500 genes (TFs+500): python generateInputs.py -e=ExpressionData.csv -g=GeneOrdering.csv -f=STRING-network.csv -i=human-tfs.csv -p=0.01 -c -t -n=500 -o=temp')

    # Input/output path options: (short flag, long flag, default, help text).
    path_options = [
        ('-e', '--expFile', 'ExpressionData.csv',
         'Path to expression data file. Required. \n'),
        ('-g', '--geneOrderingFile', 'GeneOrdering.csv',
         'Path to gene ordering file. Required. \n'),
        ('-f', '--netFile', 'STRING-network.csv',
         'Path to network file to print network statistics. Optional. \n'),
        ('-i', '--TFFile', 'human-tfs.csv',
         'Path to file containing list of TFs. Required. \n'),
        ('-o', '--outPrefix', 'BL-',
         'Prefix for writing output files. \n'),
    ]
    for short_flag, long_flag, default_value, help_text in path_options:
        parser.add_argument(short_flag, long_flag, type=str,
                            default=default_value, help=help_text)

    # Gene-selection parameters.
    parser.add_argument('-p', '--pVal', type=float, default=0.01,
                        help='p-value cutoff. Default = 0.01')
    parser.add_argument('-c', '--BFcorr', action='store_true', default=False,
                        help='Perform Bonferroni correction. Default = False. \n')
    parser.add_argument('-n', '--numGenes', type=int, default=500,
                        help='Number of genes to add. Default=500. \n')
    parser.add_argument('-t', '--TFs', action='store_true', default=False,
                        help='Add all significantly varying TFs. Default = False.\n')
    return parser
def parse_arguments():
    '''
    Parse the command-line arguments with the parser built by
    get_parser().

    :return: parsed argparse.Namespace of command line arguments
    '''
    return get_parser().parse_args()
# ---------------------------------------------------------------------------
# Main script body: read the expression matrix, gene ordering (p-value /
# variance) table, and TF list; select the significantly varying genes
# (optionally Bonferroni-corrected, optionally including all varying TFs);
# then write the subset ExpressionData, PseudoTime, TF list, and reference
# network files with the chosen output prefix.
# ---------------------------------------------------------------------------
opts = parse_arguments()
include_tfs = opts.TFs
expr_file = os.path.abspath(opts.expFile)
print("\nReading %s" % (expr_file))
gene_ordering_file = os.path.abspath(opts.geneOrderingFile)
# PseudoTime.csv is expected to live in the same directory as the expression
# file. NOTE(review): variable name is misspelled ("psuedotime").
psuedotime_file = os.path.dirname(os.path.abspath(opts.expFile))+'/PseudoTime.csv'
tf_file = os.path.abspath(opts.TFFile)
pval_cutoff = opts.pVal
bf_corr = opts.BFcorr
num_genes = opts.numGenes
# Always sort candidate genes by the variance column before taking the top N.
sort_by_variance = True
# NOTE(review): duplicate of the "Reading" print above.
print("\nReading %s" % (expr_file))
# Expression matrix: rows are genes, columns are cells.
expr_df = pd.read_csv(expr_file, header=0, index_col=0)
print("\nReading %s" % (gene_ordering_file))
# Gene ordering table: first column is the p-value; second column (if
# present) is the variance.
gene_df = pd.read_csv(gene_ordering_file, header=0, index_col=0)
#print(expr_df.head())
#print(gene_df.head())
# TF list: first column holds the TF gene names; written out unchanged.
tfs_df = pd.read_csv(tf_file, header=0)
tfs = tfs_df[tfs_df.columns[0]]
tfs.to_csv(opts.outPrefix+'TFs.csv',index=False)
if include_tfs:
    print("\nReading %s" % (tf_file))
    num_total_tfs = len(tfs)
    # limit the tfs to those present in the expression file
    tfs = tfs[tfs.isin(expr_df.index)]
    print("\t%d/%d TFs present in ExpressionData" % (
        len(tfs), num_total_tfs))
# make sure the variable genes are in the Expression Data
expr_variable_genes = set(gene_df.index.values) & set(expr_df.index.values)
extra_genes = set(gene_df.index.values) - set(expr_df.index.values)
if len(extra_genes) != 0:
    print("\nWARNING: %d variable genes are not in ExpressionData.csv:" % (len(extra_genes)))
    print(extra_genes)
    # Drop the genes that are absent from the expression matrix.
    gene_df = gene_df.loc[expr_variable_genes]
# limit the Expression matrix to the variable genes
pval_col = gene_df.columns[0]
# make sure its sorted by the pvalue column (ascending: most significant first)
gene_df.sort_values(by=pval_col, inplace=True)
variable_genes = list(gene_df.index.values)
# now figure out the genes to subset
if pval_cutoff !=0 :
    if bf_corr:
        # divide the pvalue by the # of genes to get the BF-corrected pvalue cutoff
        pval_cutoff = pval_cutoff / float(len(gene_df.index))
        # The second %s reconstructs the original (uncorrected) cutoff.
        print("\nUsing the BF-corrected p-value cutoff of %s (%s / %s genes)" % (
            pval_cutoff, pval_cutoff*float(len(gene_df.index)), len(gene_df.index)))
    # Keep only genes whose p-value is strictly below the cutoff.
    variable_genes = gene_df[gene_df[pval_col] < pval_cutoff].index.values
    print("\n%d genes pass pval_cutoff of %s" % (len(variable_genes), pval_cutoff))
    #print("\nBefore using pValue cut-off num rows: ", gene_df.shape[0] )
    gene_df = gene_df.filter(items = variable_genes, axis='index')
    print("\nAfter using pValue cut-off num rows: ", gene_df.shape[0] )
# Reset: the final gene list is rebuilt below from the top-N genes + TFs.
variable_genes = []
if include_tfs:
    # include the TFs that pass the p-val cutoff
    tfs = tfs[tfs.isin(gene_df.index)]
    if pval_cutoff:
        print("\nIncluding %d TFs that pass the pval cutoff" % (len(tfs)))
    else:
        print("\nIncluding %d TFs" % (len(tfs)))
    variable_tfs = set(tfs)
    # Remove the TFs from gene_df so the top-N pick below adds non-TF genes.
    gene_df.drop(labels = variable_tfs, axis='index', inplace = True)
else:
    variable_tfs = set()
if num_genes > 0:
    if num_genes > len(gene_df):
        # Fewer candidates than requested: take everything that remains.
        variable_genes_new = gene_df.index
        pass
    else:
        if sort_by_variance:
            #print("\nSorting by variance...")
            if len(gene_df.columns) < 2:
                print("ERROR: no variance column found. Should be 3rd column. Quitting")
                sys.exit()
            var_col = gene_df.columns[1]
            #print("Using the column '%s' as the variance columns" % (var_col))
            # the third column is the variance. Sort by that (descending)
            gene_df.sort_values(by=var_col, inplace=True, ascending = False)
        # Top num_genes most-variable genes.
        variable_genes_new = gene_df.iloc[:num_genes].index.values
# NOTE(review): if num_genes <= 0, variable_genes_new is never assigned and
# the next line raises NameError — TODO confirm intended behavior.
# Final selection = top-N genes plus the (optional) TFs; upper-cased copy is
# used only for the written index.
variable_genes1 = list(set(variable_genes_new) | set(variable_tfs))
variable_genes = [x.upper() for x in variable_genes1]
print("\nRestricting to %d genes" % (len(variable_genes)))
expr_df = expr_df.loc[variable_genes1]
print("\nNew shape of Expression Data %d x %d" % (expr_df.shape[0],expr_df.shape[1]))
# Replace the index with the upper-cased gene names before writing.
expr_df.index=variable_genes
expr_df.to_csv(opts.outPrefix+'ExpressionData.csv')
#open pseudo-time file from the path as expression data
#check if pseudo-time file is present in the path of expression data
#if os.path.exists(psuedotime_file):
print("\nReading %s" % (psuedotime_file))
pseudoTime = pd.read_csv(psuedotime_file, header=0, index_col=0)
print("\nShape of PseudoTime Data %d x %d" % (pseudoTime.shape[0],pseudoTime.shape[1]))
#subset the pseudo-time data to the cells present in the expression data
pseudoTime = pseudoTime[pseudoTime.index.isin(expr_df.columns)]
pseudoTime.to_csv(opts.outPrefix+'PseudoTime.csv')
print("\nNew shape of PseudoTime Data %d x %d" % (pseudoTime.shape[0],pseudoTime.shape[1]))
# Optionally subset the reference network to the selected genes and print
# its statistics. Pass -f=None to skip.
if opts.netFile != 'None':
    netFile = os.path.abspath(opts.netFile)
    # Network file must have Gene1/Gene2 columns.
    netDF = pd.read_csv(netFile)
    # Keep only edges between genes kept in the expression matrix.
    netDF = netDF[(netDF.Gene1.isin(expr_df.index)) & (netDF.Gene2.isin(expr_df.index))]
    # Remove self-loops.
    netDF = netDF[netDF.Gene1 != netDF.Gene2]
    # Remove duplicates (there are some repeated lines in the ground-truth networks!!!).
    netDF.drop_duplicates(keep = 'first', inplace=True)
    netDF["Type"] = 1
    netDF.to_csv(opts.outPrefix+'refNetwork.csv', index=False)
    # Density denominator counts all possible TF->gene edges minus self-edges.
    allNodes = set(netDF.Gene1.unique()).union(set(netDF.Gene2.unique()))
    nTFs = expr_df[expr_df.index.isin(netDF.Gene1.unique())].shape[0]
    nGenes = expr_df[expr_df.index.isin(allNodes)].shape[0]
    print("\n#TFs: %d, #Genes: %d, #Edges: %d, Density: %.3f" % (nTFs,nGenes,netDF.shape[0],netDF.shape[0]/((nTFs*nGenes)-nTFs)))
print("\n\nExiting...\n")