qiime代码总结
-
代码版本:2020.02
前言: 拆解qiime代码实现,梳理qiime逻辑
-
qiime工件是如何输出的
代码路径:
qiime2.core.archive.format
源码v0:
import collections
import uuid as _uuid

import yaml

import qiime2.sdk as sdk

# Allow OrderedDict to be serialized for YAML representation
yaml.add_representer(
    collections.OrderedDict,
    lambda dumper, data: dumper.represent_dict(data.items()))


class ArchiveFormat:
    """Version 0 archive layout: a metadata file plus a data directory.

    ``write`` creates ``metadata.yaml`` and the ``data`` directory under the
    archive root; constructing an instance reads that metadata back and
    parses it through ``qiime2.sdk``.
    """

    DATA_DIR = 'data'
    METADATA_FILE = 'metadata.yaml'

    @classmethod
    def _parse_metadata(cls, fh, expected_uuid):
        """Load YAML metadata from ``fh`` and validate its UUID.

        Returns the ``(uuid, type, format)`` values exactly as stored in the
        YAML document. Raises ``ValueError`` when the stored UUID differs
        from ``str(expected_uuid)``.
        """
        metadata = yaml.safe_load(fh)
        if metadata['uuid'] != str(expected_uuid):
            raise ValueError(
                "Archive root directory must match UUID present in archive's"
                " metadata: %s != %s" % (expected_uuid, metadata['uuid']))

        return metadata['uuid'], metadata['type'], metadata['format']

    @classmethod
    def _format_metadata(cls, fh, uuid, type, format):
        """Write the uuid/type/format triple to ``fh`` as block-style YAML.

        ``type`` is stored as its ``repr``; ``format`` is stored as its
        ``__name__``, or as ``None`` when no format is given.
        """
        # OrderedDict keeps the key order (uuid, type, format) in the output.
        metadata = collections.OrderedDict()
        metadata['uuid'] = str(uuid)
        metadata['type'] = repr(type)
        metadata['format'] = None
        if format is not None:
            metadata['format'] = format.__name__

        fh.write(yaml.dump(metadata, default_flow_style=False))

    @classmethod
    def load_metadata(cls, archive):
        """Return ``(uuid, type, format)`` read from the archive's metadata."""
        with archive.open(cls.METADATA_FILE) as fh:
            return cls._parse_metadata(fh, expected_uuid=archive.uuid)

    @classmethod
    def write(cls, archive_record, type, format, data_initializer, _):
        """Create the v0 layout under ``archive_record.root``.

        Writes ``metadata.yaml``, creates the ``data`` directory, then calls
        ``data_initializer(data_dir)`` to populate it. The last positional
        argument is ignored by this version.
        """
        root = archive_record.root
        metadata_fp = root / cls.METADATA_FILE

        with metadata_fp.open(mode='w') as fh:
            cls._format_metadata(fh, archive_record.uuid, type, format)

        data_dir = root / cls.DATA_DIR
        data_dir.mkdir()

        data_initializer(data_dir)

    def __init__(self, archive_record):
        """Read and parse the metadata found under ``archive_record.root``."""
        path = archive_record.root

        with (path / self.METADATA_FILE).open() as fh:
            # Local names avoid shadowing the ``type``/``format`` builtins.
            uuid_str, type_expr, format_name = self._parse_metadata(
                fh, expected_uuid=archive_record.uuid)

        self.uuid = _uuid.UUID(uuid_str)
        self.type = sdk.parse_type(type_expr)
        self.format = sdk.parse_format(format_name)

        self.path = path
        self.data_dir = path / self.DATA_DIR
# Source code, format v1:
import qiime2.core.archive.format.v0 as v0


class ArchiveFormat(v0.ArchiveFormat):
    """Version 1 archive layout: v0 plus a ``provenance`` directory."""

    PROVENANCE_DIR = 'provenance'

    @classmethod
    def write(cls, archive_record, type, format, data_initializer,
              provenance_capture):
        """Write the v0 layout, then create and fill the provenance directory.

        After the parent ``write`` has run, the ``provenance`` directory is
        created under the archive root and ``provenance_capture.finalize`` is
        called with that directory and a list holding the metadata file path
        and ``archive_record.version_fp``.
        """
        super().write(archive_record, type, format, data_initializer,
                      provenance_capture)
        root = archive_record.root

        prov_dir = root / cls.PROVENANCE_DIR
        prov_dir.mkdir()

        provenance_capture.finalize(
            prov_dir, [root / cls.METADATA_FILE, archive_record.version_fp])

    def __init__(self, archive_record):
        """Load the v0 state and record where the provenance directory is."""
        super().__init__(archive_record)

        self.provenance_dir = archive_record.root / self.PROVENANCE_DIR
# Source code, format v2:
import qiime2.core.archive.format.v1 as v1


class ArchiveFormat(v1.ArchiveFormat):
    """Version 2 archive layout; adds no code on top of v1."""

    # Exactly the same as v1, but in provenance, when the action type isn't
    # import, there is an `output-name` key in the action section with that
    # node's output name according to the action's signature object. Also has
    # pipeline action types.
    pass
# Source code, format v3:
import qiime2.core.archive.format.v2 as v2


class ArchiveFormat(v2.ArchiveFormat):
    """Version 3 archive layout; adds no code on top of v2."""

    # Exactly the same as v2, but inputs may be variadic where the UUIDs are in
    # a YAML sequence. Additionally `Set` is now represented as a sequence
    # with a custom !set tag.
    pass
# Source code, format v4:
import qiime2.core.archive.format.v3 as v3
# NOTE(review): the original snippet used Citations without importing it;
# upstream QIIME 2 takes it from qiime2.core.cite -- confirm for your version.
from qiime2.core.cite import Citations


class ArchiveFormat(v3.ArchiveFormat):
    """Version 4 archive layout: v3 plus a ``citations`` accessor."""

    # - Adds a transformers section to action.yaml
    # - Adds citations via the !cite yaml type which references the
    #   /provenance/citations.bib file (this is nested like everything else
    #   in the /provenance/artifacts/ directories).
    # - environment:framework has been updated to be a nested object,
    #   its schema is identical to a environment:plugins:<entry> object.
    #   Prior to v4, it was only a version string.

    @property
    def citations(self):
        """Merge every ``citations.bib`` found in the provenance directory.

        The top-level ``provenance/citations.bib`` is always loaded. Each
        child of ``provenance/artifacts`` contributes its own
        ``citations.bib`` when that file exists. Returns one ``Citations``
        object updated with all of them.
        """
        bib_files = [self.provenance_dir / 'citations.bib']

        artifacts_dir = self.provenance_dir / 'artifacts'
        if artifacts_dir.exists():
            for ancestor in artifacts_dir.iterdir():
                ancestor_bib = ancestor / 'citations.bib'
                if ancestor_bib.exists():
                    bib_files.append(ancestor_bib)

        citations = Citations()
        for bib_file in bib_files:
            # Citations.load receives a string path, as in the original code.
            citations.update(Citations.load(str(bib_file)))

        return citations
# Source code, format v5:
import qiime2.core.archive.format.v4 as v4
# NOTE(review): the original snippet used these helpers without importing
# them; upstream QIIME 2 defines them in qiime2.core.util -- confirm.
from qiime2.core.util import md5sum_directory, to_checksum_format


class ArchiveFormat(v4.ArchiveFormat):
    """Version 5 archive layout: v4 plus a checksum manifest."""

    CHECKSUM_FILE = 'checksums.md5'

    # Adds `checksums.md5` to root of directory structure

    @classmethod
    def write(cls, archive_record, type, format, data_initializer,
              provenance_capture):
        """Write the v4 layout, then record checksums of the archive root.

        ``md5sum_directory`` is run on the archive root after the parent
        ``write`` has finished. Each resulting item is formatted with
        ``to_checksum_format`` and written on its own line of
        ``checksums.md5``.
        """
        super().write(archive_record, type, format, data_initializer,
                      provenance_capture)

        checksums = md5sum_directory(str(archive_record.root))
        with (archive_record.root / cls.CHECKSUM_FILE).open('w') as fh:
            for item in checksums.items():
                fh.write(to_checksum_format(*item))
                fh.write('\n')
# Code walkthrough summary
通过复写write方法并进行父级调用使不同版本的ArchiveFormat类联系起来
v0: 生成data目录,在根节点目录下输出 metadata.yaml 文件
v1: 生成provenance目录,并在provenance目录下输出出处相关文件(action,artifacts等)
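v2: 代码无改动(pass),仅provenance内容变化:非import类型的action增加output-name键,并新增pipeline动作类型
v3: 代码无改动(pass),输入可为可变参数(UUID以YAML序列表示),Set以带!set标签的序列表示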
v4: 新增citations属性,读取并汇总provenance目录(含artifacts下各祖先目录)中的citations.bib
v5: 输出checksums.md5文件,记录所有文件的md5信息
最后:
# qiime2.core.archive.archiver.py
# The Archiver class selects one of the six archive format classes based on
# the version passed in.
import importlib


class Archiver:
    """Excerpt of the archiver showing how a format class is looked up.

    NOTE(review): ``_ZipArchive`` is defined elsewhere in the original module
    and is not part of this excerpt.
    """

    CURRENT_FORMAT_VERSION = '5'
    CURRENT_ARCHIVE = _ZipArchive
    _FORMAT_REGISTRY = {
        # NOTE: add more archive formats as things change
        '0': 'qiime2.core.archive.format.v0:ArchiveFormat',
        '1': 'qiime2.core.archive.format.v1:ArchiveFormat',
        '2': 'qiime2.core.archive.format.v2:ArchiveFormat',
        '3': 'qiime2.core.archive.format.v3:ArchiveFormat',
        '4': 'qiime2.core.archive.format.v4:ArchiveFormat',
        '5': 'qiime2.core.archive.format.v5:ArchiveFormat'
    }

    # .... (the rest of the class is omitted in these notes)

    @classmethod
    def get_format_class(cls, version):
        """Return the format class registered for ``version``.

        Registry entries have the form ``'module.path:ClassName'``; the
        module is imported on demand. Returns ``None`` when ``version`` is
        not a key of the registry.
        """
        try:
            imp, fmt_cls = cls._FORMAT_REGISTRY[version].split(':')
        except KeyError:
            return None
        return getattr(importlib.import_module(imp), fmt_cls)
# -
常见文件格式读取
读取
# fastq.gz file
import gzip
import itertools


# See: q2_demux._demux.py:function@_read_fastq_seqs
def read_fastq_seqs(filepath):
    """Yield FASTQ records from a gzip-compressed file.

    Each record is a 4-tuple ``(seq_header, seq, qual_header, qual)`` with
    surrounding whitespace stripped from every field.

    Raises:
        ValueError: if the number of lines is not a multiple of four, i.e.
            the last record is incomplete.
    """
    # The context manager closes the handle even when the consumer stops
    # iterating early or an error is raised.
    with gzip.open(filepath, 'rt') as fh:
        # Passing the same iterator four times makes zip_longest consume the
        # file four lines at a time; missing lines are filled with None.
        for record in itertools.zip_longest(*[fh] * 4):
            if None in record:
                raise ValueError(
                    'Incomplete FASTQ record at end of file: %s' % filepath)
            seq_header, seq, qual_header, qual = record
            yield (seq_header.strip(), seq.strip(), qual_header.strip(),
                   qual.strip())
# Read
# biom file
def read_biom_file(filepath):
    """Load a BIOM table and pull out its commonly used pieces.

    NOTE(review): ``load_table`` is not imported in this snippet; presumably
    it is ``biom.load_table`` -- confirm and import it in the enclosing
    module.

    Returns:
        A tuple ``(table, ids, otu_ids, cnts)``. The original snippet
        computed these values but returned nothing.
    """
    table = load_table(filepath)

    # Examples of the common ways to get data out of the table.
    # ids() with no argument uses the table's default axis (presumably the
    # sample axis -- confirm against the biom documentation).
    ids = table.ids()
    otu_ids = table.ids(axis='observation')
    # Integer counts as a dense array, transposed relative to matrix_data.
    cnts = table.matrix_data.astype(int).toarray().T

    return table, ids, otu_ids, cnts
# Read a phylogenetic tree (text format; the file extension is usually
# tre|nwk, etc.) file
def read_tree_node(filepath) -> skbio.TreeNode:
    """Read a Newick-formatted tree from ``filepath``.

    NOTE(review): ``skbio`` is not imported in this snippet; it must be
    imported by the enclosing module (the return annotation is evaluated
    when the function is defined).
    """
    # ignore BOM only when reading, do not emit BOM on write
    with open(filepath, mode='r', encoding='utf-8-sig') as fh:
        return skbio.TreeNode.read(fh, format='newick', verify=False)
# Read a distance matrix (a kind of
# TSV format) file
def read_distence_matrix(filepath):
    """Read a distance matrix stored in the ``lsmat`` format.

    NOTE(review): ``skbio`` is not imported in this snippet; it must be
    imported by the enclosing module.
    """
    return skbio.DistanceMatrix.read(filepath, format='lsmat', verify=False)


# Correctly spelled alias; the original (misspelled) name stays available so
# existing callers keep working.
read_distance_matrix = read_distence_matrix
# Read a PCoA analysis result file
# This reader is currently not needed: the PCoA result is computed directly
# from the distance matrix instead, e.g.
#   master = skbio.stats.ordination.pcoa(dm, method='eigh', inplace=False)
def read_ordination_data(filepath):
    """Read ordination results stored in the ``ordination`` format.

    NOTE(review): ``skbio`` is not imported in this snippet; it must be
    imported by the enclosing module.
    """
    return skbio.OrdinationResults.read(filepath, format='ordination',
                                        verify=False)