Source code for tfmp

#!/usr/bin/env python
"""
pytfmpval contains Python wrappers for the TFM-Pvalue program. It allows for
accurate calculation of p-values associated with scores for a given motif PWM.
Naturally, it can also be used to calculate the score corresponding to a given
p-value for a transcription factor motif PWM.

The original program can be downloaded at:
http://bioinfo.lifl.fr/tfm-pvalue/tfm-pvalue.php

Additionally, see this paper for how the p-values and thresholds are calculated:
Efficient and accurate P-value computation for Position Weight Matrices
H. Touzet and J.S. Varre
Algorithms for Molecular Biology 2007, 2:15

Copyright (C) 2017  Jared Andrews

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from __future__ import absolute_import, division, print_function, unicode_literals
import pytfmpval.pytfmpval as tfm
from math import ceil


[docs]def create_matrix(matrix_file, bg=[0.25, 0.25, 0.25, 0.25], mat_type="counts", log_type="nat"): """ From a JASPAR formatted motif matrix count file, create a Matrix object. This function also converts it to a log-odds (position weight) matrix if necessary. Args: matrix_file (str): White-space delimited string of row-concatenated motif matrix. bg (list of floats): Background nucleotide frequencies for [A, C, G, T]. mat_type (str): Type of motif matrix provided. Options are: "counts", "pfm", "pwm". "counts" is for raw count matrices for each base at each position. "pfm" is for position frequency matrices (frequencies already calculated. "pwm" is for position weight matrices (also referred to as position-specific scoring matrices.) log_type (str): Base to use for log. Default is to use the natural log. "log2" is the other option. This will affect the scores and p-values. Returns: m (pytfmpval Matrix): Matrix in pwm format. """ a, c, g, t = bg[0], bg[1], bg[2], bg[3] m = tfm.Matrix(a, c, g, t) m.readJasparMatrix(matrix_file) if mat_type.upper() == "COUNTS": if log_type.upper() == "NAT": m.toLogOddRatio() elif log_type.upper() == "LOG2": m.toLog2OddRatio() else: print("Improper log type argument, using natural log.") m.toLogOddRatio() return m
[docs]def read_matrix(matrix, bg=[0.25, 0.25, 0.25, 0.25], mat_type="counts", log_type="nat"): """ From a string of space-delimited counts create a Matrix object. Break the string into 4 rows corresponding to A, C, G, and T. This function also converts it to a log-odds (position weight) matrix if necessary. Args: matrix_file (str): White-space delimited string of row-concatenated motif matrix. bg (list of floats): Background nucleotide frequencies for [A, C, G, T]. mat_type (str): Type of motif matrix provided. Options are: "counts", "pfm", "pwm". "counts" is for raw count matrices for each base at each position. "pfm" is for position frequency matrices (frequencies already calculated). "pwm" is for position weight matrices (also referred to as position-specific scoring matrices.) log_type (str): Base to use for log. Default is to use the natural log. "log2" is the other option. This will affect the scores and p-values. Returns: m (pytfmpval Matrix): Matrix in pwm format. """ try: a, c, g, t = bg[0], bg[1], bg[2], bg[3] if (len(matrix.split()) % 4) != 0: raise ValueError("Uneven rows in motif matrix. Ensure rows of equal length in input.") m = tfm.Matrix(a, c, g, t) m.readMatrix(matrix) if mat_type.upper() == "COUNTS": if log_type.upper() == "NAT": m.toLogOddRatio() elif log_type.upper() == "LOG2": m.toLog2OddRatio() else: print("Improper log type argument, using natural log.") m.toLogOddRatio() return m except ValueError as error: print(repr(error))
[docs]def score2pval(matrix, req_score): """ Determine the p-value for a given score for a specific motif PWM. Args: matrix (pytfmpval Matrix): Matrix in pwm format. req_score (float): Requested score for which to determine the p-value. Returns: ppv (float): The calculated p-value corresponding to the score. """ granularity = 0.1 max_granularity = 1e-10 decrgr = 10 # Factor to increase granularity by after each iteration. pv = tfm.doublep() ppv = tfm.doublep() while granularity > max_granularity: matrix.computesIntegerMatrix(granularity) max_s = int(req_score * matrix.granularity + matrix.offset + matrix.errorMax + 1) min_s = int(req_score * matrix.granularity + matrix.offset - matrix.errorMax - 1) score = int(req_score * matrix.granularity + matrix.offset) matrix.lookForPvalue(score, min_s, max_s, ppv, pv) if ppv.value() == pv.value(): return ppv.value() granularity = granularity / decrgr print("Max granularity exceeded. Returning closest approximation.") return ppv.value()
[docs]def pval2score(matrix, pval): """ Determine the score for a given p-value for a specific motif PWM. Args: matrix (pytfmpval Matrix): Matrix in pwm format. pval (float): p-value for which to determine the score. Returns: score (float): The calculated score corresponding to the p-value. """ init_granularity = 0.1 max_granularity = 1e-10 decrgr = 10 # Factor to increase granularity by after each iteration. pv = tfm.doublep() # Initialize as a c++ double. ppv = tfm.doublep() matrix.computesIntegerMatrix(init_granularity) max_s = int(matrix.maxScore + ceil(matrix.errorMax + 0.5)) min_s = int(matrix.minScore) granularity = init_granularity while granularity > max_granularity: matrix.computesIntegerMatrix(granularity) score = matrix.lookForScore(min_s, max_s, pval, pv, ppv) min_s = int((score - ceil(matrix.errorMax + 0.5)) * decrgr) max_s = int((score + ceil(matrix.errorMax + 0.5)) * decrgr) if ppv.value() == pv.value(): break granularity = granularity / decrgr if granularity <= max_granularity: print("Max granularity exceeded. Returning closest score approximation.") final_score = (score - matrix.offset) / matrix.granularity return final_score