Source code for lcdblib.utils.chrom_convert

#!/usr/bin/env python
""" Converts between chromosome names. """
import sys
import argparse
from argparse import RawDescriptionHelpFormatter as Raw
import pkg_resources
import gzip

import pandas as pd
import pysam
import pybedtools
from Bio import SeqIO


[docs]def arguments():
    """ Function to pull command line arguments """

    DESCRIPTION = """\
    Converts between chromosome names.

    In D. melanogaster there are four common varieties of chromosomes names
    that are used.

    * FlyBase style: 2L, 2R, 3L, etc.
    * UCSC style: chr2L, chr2R, chr3L, etc.
    * RefSeq style: NT_033779.5, NT_033778.4, NT_037436.4, etc.
    * GenBank style: AE014134.6, AE013599.5, AE014296.5, etc.

    Each style has its benefits: FlyBase is the standard, UCSC is compatible
    with the genome browser, and RefSeq and GenBank are the most explicit. It
    is common to come across files that use these different styles of
    chromosome name, this tool aims to easily convert chromosome names in a
    variety of file format.

    """

    parser = argparse.ArgumentParser(description=DESCRIPTION,
                                     formatter_class=Raw)

    parser.add_argument("--from", dest="orig", action='store', required=True,
                        choices=['FlyBase', 'UCSC', 'RefSeq', 'GenBank'],
                        help="Current type of chromosome name.")

    parser.add_argument("--to", dest="new", action='store', required=True,
                        choices=['FlyBase', 'UCSC', 'RefSeq', 'GenBank'],
                        help="The type of chromosome name wanted.")

    parser.add_argument("--fileType", dest="type", action='store',
                        required=True,
                        choices=['SAM', 'BAM', 'BED', 'GFF', 'GTF', 'FASTA'],
                        help="What is the input format.")

    parser.add_argument("-i", "--input", dest="input", action='store',
                        required=True,
                        help="""Input file to convert. If `-i -`, STDIN is
                        used. Note if using SAM/BAM with STDIN, you must
                        include headrs.""")

    parser.add_argument("-o", "--output", dest="output", action='store',
                        required=False, default='-',
                        help="Output file, if none given or `-o -` "
                        "then STDOUT.")

    parser.add_argument("--debug", dest="debug", action='store_true',
                        required=False, help="Enable debug output.")

    return parser.parse_args()


[docs]def import_conversion(f, t):
    """
    Import NCBI conversion table.

    Parameters
    ----------
    f: str {FlyBase, UCSC, NCBI}
        The current chromosome format.
    t: str {FlyBase, UCSC, NCBI}
        The desired chromosome format.

    Returns
    -------
    dict: Mapping {f: t}

    """

    # Get location of the file
    gz = pkg_resources.resource_filename(
        'lcdblib',
        'data/GCF_000001215.4.assembly.txt.gz'
    )

    # Import file
    df = pd.read_csv(gz, compression='gzip', comment='#', sep='\t',
                     header=None)

    # Make mapping from keyword to column number
    # col#   Header
    # 0      Sequence-Name
    # 1      Sequence-Role
    # 2      Assigned-Molecule
    # 3      Assigned-Molecule-Location/Type
    # 4      GenBank-Accn
    # 5      Relationship
    # 6      RefSeq-Accn
    # 7      Assembly-Unit
    # 8      Sequence-Length
    # 9      UCSC-style-name
    mapping = {'FlyBase': 0, 'UCSC': 9, 'GenBank': 4, 'RefSeq': 6}

    # This table has a small error were FlyBase mitochondrion_genome is MT
    df.replace('MT', 'mitochondrion_genome', inplace=True)

    return {k: v for k, v in df[[mapping[f], mapping[t]]].values}


[docs]def pysam_convert(input, output, kind, mapper):
    """
    Use pysam to convert chromosomes in BAM and SAM files.

    pysam uses a header to define chromosomes, then each read is just mapped
    back to this header. Only the header needs to be modified, and reads
    need to be written to the new output file which uses this header.

    """
    # Determine SAM or BAM flags
    if kind == 'BAM':
        flag_in = 'rb'
        flag_out = 'wb'
    elif kind == 'SAM':
        flag_in = 'r'
        flag_out = 'wh'

    curr = pysam.AlignmentFile(input, flag_in)

    # Change chromosome in the header
    header = curr.header
    for chrom in header['SQ']:
        chrom['SN'] = mapper[chrom['SN']]

    with pysam.AlignmentFile(output, flag_out, header=header) as OUT:
        for read in curr:
            OUT.write(read)


[docs]def convertFeature(f, mapper):
    f.chrom = mapper[f.chrom]
    return f


[docs]def pybedtools_convert(input, output, mapper):
    """ Use pybedtools to convert chromosomes in BED, GTF, or GFF. """
    if input == '-':
        # Use STDIN
        bt = pybedtools.BedTool(sys.stdin)
    else:
        bt = pybedtools.BedTool(input)

    if output == '-':
        # Use STDOUT
        print(bt.each(convertFeature, mapper))
    else:
        bt.each(convertFeature, mapper).saveas(output)


[docs]def fasta_convert(input, output, mapper):
    """ Uses Biopython.SeqIO to convert FASTA headers. """

    if input == '-':
        # Use STDIN
        fh = sys.stdin
    elif input.endswith('.gz'):
        fh = gzip.open(input, 'rt')
    else:
        fh = open(input, 'r')

    if output == '-':
        # Use STDOUT
        oh = sys.stdout
    elif output.endswith('.gz'):
        oh = gzip.open(input, 'wb')
    else:
        oh = open(output, 'w')

    for seq in SeqIO.parse(fh, 'fasta'):
        seq.description = seq.description.replace(seq.id, mapper[seq.id])
        seq.name = mapper[seq.name]
        seq.id = mapper[seq.id]
        SeqIO.write(seq, oh, 'fasta')

    # close file handlers
    fh.close()
    oh.close()


[docs]def main():
    # Import commandline arguments.
    args = arguments()

    # Get mapping dict
    mapper = import_conversion(args.orig, args.new)

    if (args.type == 'BAM') | (args.type == 'SAM'):
        pysam_convert(args.input, args.output, args.type, mapper)
    elif (args.type == 'BED') | (args.type == 'GFF') | (args.type == 'GTF'):
        pybedtools_convert(args.input, args.output, mapper)
    elif (args.type == 'FASTA'):
        fasta_convert(args.input, args.output, mapper)