#!/usr/bin/env python
""" Converts between chromosome names. """
import sys
import argparse
from argparse import RawDescriptionHelpFormatter as Raw
import pkg_resources
import gzip
import pandas as pd
import pysam
import pybedtools
from Bio import SeqIO
[docs]def arguments():
""" Function to pull command line arguments """
DESCRIPTION = """\
Converts between chromosome names.
In D. melanogaster there are four common varieties of chromosomes names
that are used.
* FlyBase style: 2L, 2R, 3L, etc.
* UCSC style: chr2L, chr2R, chr3L, etc.
* RefSeq style: NT_033779.5, NT_033778.4, NT_037436.4, etc.
* GenBank style: AE014134.6, AE013599.5, AE014296.5, etc.
Each style has its benefits: FlyBase is the standard, UCSC is compatible
with the genome browser, and RefSeq and GenBank are the most explicit. It
is common to come across files that use these different styles of
chromosome name, this tool aims to easily convert chromosome names in a
variety of file format.
"""
parser = argparse.ArgumentParser(description=DESCRIPTION,
formatter_class=Raw)
parser.add_argument("--from", dest="orig", action='store', required=True,
choices=['FlyBase', 'UCSC', 'RefSeq', 'GenBank'],
help="Current type of chromosome name.")
parser.add_argument("--to", dest="new", action='store', required=True,
choices=['FlyBase', 'UCSC', 'RefSeq', 'GenBank'],
help="The type of chromosome name wanted.")
parser.add_argument("--fileType", dest="type", action='store',
required=True,
choices=['SAM', 'BAM', 'BED', 'GFF', 'GTF', 'FASTA'],
help="What is the input format.")
parser.add_argument("-i", "--input", dest="input", action='store',
required=True,
help="""Input file to convert. If `-i -`, STDIN is
used. Note if using SAM/BAM with STDIN, you must
include headrs.""")
parser.add_argument("-o", "--output", dest="output", action='store',
required=False, default='-',
help="Output file, if none given or `-o -` "
"then STDOUT.")
parser.add_argument("--debug", dest="debug", action='store_true',
required=False, help="Enable debug output.")
return parser.parse_args()
[docs]def import_conversion(f, t):
"""
Import NCBI conversion table.
Parameters
----------
f: str {FlyBase, UCSC, NCBI}
The current chromosome format.
t: str {FlyBase, UCSC, NCBI}
The desired chromosome format.
Returns
-------
dict: Mapping {f: t}
"""
# Get location of the file
gz = pkg_resources.resource_filename(
'lcdblib',
'data/GCF_000001215.4.assembly.txt.gz'
)
# Import file
df = pd.read_csv(gz, compression='gzip', comment='#', sep='\t',
header=None)
# Make mapping from keyword to column number
# col# Header
# 0 Sequence-Name
# 1 Sequence-Role
# 2 Assigned-Molecule
# 3 Assigned-Molecule-Location/Type
# 4 GenBank-Accn
# 5 Relationship
# 6 RefSeq-Accn
# 7 Assembly-Unit
# 8 Sequence-Length
# 9 UCSC-style-name
mapping = {'FlyBase': 0, 'UCSC': 9, 'GenBank': 4, 'RefSeq': 6}
# This table has a small error were FlyBase mitochondrion_genome is MT
df.replace('MT', 'mitochondrion_genome', inplace=True)
return {k: v for k, v in df[[mapping[f], mapping[t]]].values}
[docs]def pysam_convert(input, output, kind, mapper):
"""
Use pysam to convert chromosomes in BAM and SAM files.
pysam uses a header to define chromosomes, then each read is just mapped
back to this header. Only the header needs to be modified, and reads
need to be written to the new output file which uses this header.
"""
# Determine SAM or BAM flags
if kind == 'BAM':
flag_in = 'rb'
flag_out = 'wb'
elif kind == 'SAM':
flag_in = 'r'
flag_out = 'wh'
curr = pysam.AlignmentFile(input, flag_in)
# Change chromosome in the header
header = curr.header
for chrom in header['SQ']:
chrom['SN'] = mapper[chrom['SN']]
with pysam.AlignmentFile(output, flag_out, header=header) as OUT:
for read in curr:
OUT.write(read)
[docs]def convertFeature(f, mapper):
f.chrom = mapper[f.chrom]
return f
[docs]def fasta_convert(input, output, mapper):
""" Uses Biopython.SeqIO to convert FASTA headers. """
if input == '-':
# Use STDIN
fh = sys.stdin
elif input.endswith('.gz'):
fh = gzip.open(input, 'rt')
else:
fh = open(input, 'r')
if output == '-':
# Use STDOUT
oh = sys.stdout
elif output.endswith('.gz'):
oh = gzip.open(input, 'wb')
else:
oh = open(output, 'w')
for seq in SeqIO.parse(fh, 'fasta'):
seq.description = seq.description.replace(seq.id, mapper[seq.id])
seq.name = mapper[seq.name]
seq.id = mapper[seq.id]
SeqIO.write(seq, oh, 'fasta')
# close file handlers
fh.close()
oh.close()
[docs]def main():
# Import commandline arguments.
args = arguments()
# Get mapping dict
mapper = import_conversion(args.orig, args.new)
if (args.type == 'BAM') | (args.type == 'SAM'):
pysam_convert(args.input, args.output, args.type, mapper)
elif (args.type == 'BED') | (args.type == 'GFF') | (args.type == 'GTF'):
pybedtools_convert(args.input, args.output, mapper)
elif (args.type == 'FASTA'):
fasta_convert(args.input, args.output, mapper)