从基因组 gbff 文件中提取元数据

extract metadata from genomic gbff files

我有 >1000 .gbff.gz 个基因组文件,我想从每个文件中提取元数据,并在单独的列中包含元数据条目。

在 Matlab 中你可以使用 genbankread of the Bioinformatics Toolbox。这是一个如何实现您想要的示例:

results = [];

% unzip data
gunzip('*.gbff.gz');

% process each file
files = dir('*.gbff');
for file = {files.name}
  data = genbankread(char(file));

  % process each file entry
  for i = 1:size(data, 2)
    LocusName = '';
    Definition = '';
    Organism = '';
    GenesTotal = NaN;
    GenesCoding = NaN;
    RRNAs = '';
    TRNAs = NaN;
    IsolationSource = '';
    Country = '';

    % copy fields
    if isfield(data(i), 'LocusName')
      LocusName = data(i).LocusName;
    end
    if isfield(data(i), 'Definition')
      Definition = data(i).Definition;
    end
    if isfield(data(i), 'Source')
      Organism = data(i).Source;
    end

    % parse comments
    if isfield(data(i), 'Comment')
      for j = 1:size(data(i).Comment, 1)
        tokens = regexp(data(i).Comment(j, :), ...
          '^\s*([^\s].*[^\s])\s*::\s*([^\s].*[^\s])\s*$', 'tokens');
        if ~isempty(tokens)
          switch tokens{1}{1}
            case 'Genes (total)'
              GenesTotal = str2double(tokens{1}{2});
            case 'Genes (coding)'
              GenesCoding = str2double(tokens{1}{2});
            case 'rRNAs'
              RRNAs = tokens{1}{2};
            case 'tRNAs'
              TRNAs = str2double(tokens{1}{2});
          end
        end
      end
    end

    % parse features
    if isfield(data(i), 'Features')
      Feature = '';
      for j = 1:size(data(i).Features, 1)
        tokens = regexp(data(i).Features(j, :), '^(\w+)', 'tokens');
        if isempty(tokens)
          tokens = regexp(data(i).Features(j, :), ...
            '^\s+/(\w+)="([^"]+)"', 'tokens');
          if ~isempty(tokens)
            switch Feature
              case 'source'
                switch tokens{1}{1}
                  case 'isolation_source'
                    IsolationSource = tokens{1}{2};
                  case 'country'
                    Country = tokens{1}{2};
                end
            end
          end
        else
          Feature = tokens{1}{1};
        end
      end
    end

    % append entries to results
    results = [results; struct( ...
      'File', char(file), 'LocusName', LocusName, 'Definition', Definition, ...
      'Organism', Organism, 'GenesTotal', GenesTotal, ...
      'GenesCoding', GenesCoding, 'RRNAs', RRNAs, 'TRNAs', TRNAs, ...
      'IsolationSource', IsolationSource, 'Country', Country)];
  end
end

% data is in variable results