使用 exists 运算符避免 DBI 重复主键
Avoid DBI duplicate primary key with exists operator
我想将 FASTA 文件中的信息插入 MySQL 数据库的一张表中,并使用 Ensembl_id 列作为主键。
其中一些 Ensembl_id 不是唯一的,所以我尝试使用 exists 运算符来解决这个问题。但是表中只插入了 5 行,其中只有 1 行的 Ensembl_id 值是重复出现的。
#!/usr/bin/perl -w
# Reads the gzip-compressed mouse cDNA FASTA file record by record and inserts
# one row per Ensembl gene ID into the table named on the command line. A hash
# of already-seen IDs is used to skip records whose ID was met before.
#usage script.pl <username> <password> <database_name> <mouse_genes> <mouse_transcripts>
use DBI;
use Data::Dumper;

# Positional command-line arguments; each one must be present and true.
my $user = shift @ARGV or die $!;
my $password = shift @ARGV or die $!;
my $database = shift @ARGV or die $!;
my $mouse_genes = shift @ARGV or die $!;
my $mouse_transcripts = shift @ARGV or die $!;

# RaiseError makes every failing DBI call die instead of returning an error.
my $dbh = DBI->connect( "dbi:mysql:$database:localhost", "$user", "$password",
    { RaiseError => 1 } );

# Ensembl IDs that have already been encountered.
my %gene;

# Split the input on "\n>" so each read returns one whole FASTA record.
$/ = "\n>";
open( FILE, "gzip -d -c /data.dash/class2016/student/Mus_musculus.GRCm38.cdna.all.fa.gz |" )
    or die $!;

LOOP:
while ( <FILE> ) {
    my $line = $_;
    chomp $line;
    if ( $line =~ /[a-z]/ ) {
        my @array = split( "\t", $line );
        if ( m/gene:(\w+\d+\.\w+)/ ) {
            my $Ensembl_id = $1;

            # NOTE(review): the ID is marked as seen before the remaining
            # fields are matched, so if this record fails one of the patterns
            # below, later records with the same ID are skipped as well.
            if ( !exists $gene{$Ensembl_id} ) {
                $gene{$Ensembl_id} = 1;
            }
            else {
                next;
            }
            if ( m/gene_biotype:(\w+)/ ) {
                my $gene_biotype = $1;
                if ( m/gene_symbol:(\w+\D\d+)/ ) {
                    my $gene_symbol = $1;
                    if ( m/description:(\w+\s+\w+\s+\w+\s+)/ ) {
                        my $gene_description = $1;
                        if ( m/MGI:(\d+)/ ) {
                            my $MGI_accession = $1;
                            my $sth = $dbh->prepare(
                                qq{insert into $mouse_genes (Ensembl_id,gene_biotype,gene_symbol,gene_description,MGI_accession) values ("$Ensembl_id","$gene_biotype","$gene_symbol","$gene_description","$MGI_accession")}
                            );
                            $sth->execute();
                            $sth->finish();
                            next LOOP;
                        }
                    }
                }
            }
        }
    }
}
close FILE;
$dbh->disconnect();
如果主键 $Ensembl_id 重复,我该如何使用 exists 运算符跳到文件的下一行?
我以为我看到了一个和这个很相似的问题,但是我找不到了
解决方案是放弃哈希,改用 IGNORE 关键字来避免引发错误。MySQL 文档中写道:
If you use the IGNORE keyword, errors that occur while executing the INSERT statement are ignored. For example, without IGNORE, a row that duplicates an existing UNIQUE index or PRIMARY KEY value in the table causes a duplicate-key error and the statement is aborted. With IGNORE, the row is discarded and no error occurs. Ignored errors may generate warnings instead, although duplicate-key errors do not.
您还应该在 SQL 语句中使用占位符,因此它应该如下所示。
注意结束标记 END_SQL 必须独占一行,且前后都不能有空白字符。您可能希望在程序顶部定义 SQL 语句,以免破坏代码的缩进。
# Build the statement text first, then hand it to prepare(). The five "?"
# placeholders are filled in by execute(), in the order the columns are listed.
my $insert_sql = <<END_SQL;
INSERT IGNORE INTO $mouse_genes (
Ensembl_id,
gene_biotype,
gene_symbol,
gene_description,
MGI_accession
)
VALUES ( ?, ?, ?, ?, ? )
END_SQL
my $sth = $dbh->prepare($insert_sql);

# One bind value per placeholder.
my @row_values = (
    $Ensembl_id,
    $gene_biotype,
    $gene_symbol,
    $gene_description,
    $MGI_accession,
);
$sth->execute(@row_values);
更新
您的程序可以整理很多,使其更易于阅读。我会这样写
#!/usr/bin/perl
use strict;
use warnings 'all';

# Reads the gzip-compressed mouse cDNA FASTA file record by record and inserts
# one row per record into the table named on the command line. INSERT IGNORE
# is used so that a duplicate Ensembl_id primary key does not raise an error.
#
# usage script.pl <username> <password> <database_name> <mouse_genes> <mouse_transcripts>

use DBI;

# All five positional arguments are required. Report how to call the script
# rather than printing an unrelated $! value when one is missing.
die "usage: $0 <username> <password> <database_name> <mouse_genes> <mouse_transcripts>\n"
    if @ARGV < 5;
my ( $user, $password, $database, $mouse_genes, $mouse_transcripts )
    = splice @ARGV, 0, 5;    # $mouse_transcripts is not used at present

# RaiseError makes every failing DBI call die; PrintError is switched off so
# the same message is not also emitted as a warning.
my $dbh = DBI->connect( "dbi:mysql:$database:localhost", $user, $password,
    { RaiseError => 1, PrintError => 0 } );

# Prepared once, executed once per record. The heredoc terminator must stand
# alone on its line with no surrounding whitespace.
my $sth = $dbh->prepare( <<END_SQL );
INSERT IGNORE INTO $mouse_genes (
Ensembl_id,
gene_biotype,
gene_symbol,
gene_description,
MGI_accession
)
VALUES ( ?, ?, ?, ?, ? )
END_SQL

# List-form pipe open: gzip is run directly with these arguments.
my @cmd = ( 'gzip', '-d', '-c',
    '/data.dash/class2016/student/Mus_musculus.GRCm38.cdna.all.fa.gz' );
open my $cmd_fh, '-|', @cmd or die "Cannot run '@cmd': $!\n";

{
    # Split the input on "\n>" so each read returns one whole FASTA record;
    # localised so the change does not outlive this block.
    local $/ = "\n>";

    while ( <$cmd_fh> ) {

        # A record is skipped as soon as any one of the five fields is absent.
        next unless my ( $ensembl_id ) = /gene:(\w+\d+\.\w+)/;
        next unless my ( $gene_biotype ) = /gene_biotype:(\w+)/;
        next unless my ( $gene_symbol ) = /gene_symbol:(\w+\D\d+)/;
        next unless my ( $gene_description ) = /description:(\w+\s+\w+\s+\w+)\s/;
        next unless my ( $mgi_accession ) = /MGI:(\d+)/;

        $sth->execute( $ensembl_id, $gene_biotype, $gene_symbol, $gene_description, $mgi_accession );
    }
}

# Closing a pipe waits for the child process; a false result with $! unset
# means gzip itself exited with a failure status (available in $?).
if ( !close $cmd_fh ) {
    die $!
        ? "Error closing pipe from '@cmd': $!\n"
        : "'@cmd' failed (wait status $?)\n";
}

$dbh->disconnect;
我想出了如何使用哈希来避免重复的键:
#!/usr/bin/perl -w
#this script inserts sequences from Mus_musculus.GRCm38.cdna.all.fa.gz into mouse_genes table
#usage lab5_2.pl <username> <password> <database_name> <mouse_genes> <mouse_transcripts>
#
# A hash of already-seen Ensembl IDs ensures that only the first record for
# each ID is considered, so the primary key is never inserted twice.
use DBI;
use Data::Dumper;

# Positional command-line arguments; each one must be present and true.
my $user = shift @ARGV or die $!;
my $password = shift @ARGV or die $!;
my $database = shift @ARGV or die $!;
my $mouse_genes = shift @ARGV or die $!;

# RaiseError makes every failing DBI call die instead of returning an error.
my $dbh = DBI->connect("dbi:mysql:$database:localhost",
    "$user",
    "$password",
    {RaiseError => 1}
);

# Prepared once and reused for every row. The values are passed as bind
# parameters, so quotes inside the free-text description cannot break the
# statement. The table name comes from the <mouse_genes> argument.
my $sth = $dbh->prepare(
    qq{insert into $mouse_genes (Ensembl_id, gene_biotype, gene_symbol, gene_description, MGI_accession) values (?, ?, ?, ?, ?)}
);

# Ensembl IDs that have already been encountered.
my %gene;

# Split the input on "\n>" so each read returns one whole FASTA record.
$/ = "\n>";
open (FILE, "gzip -d -c /data.dash/class2016/student/Mus_musculus.GRCm38.cdna.all.fa.gz |") or die $!;

LOOP: while (<FILE>) {
    if (m/gene:(\w+\d+\.\d+)/) {
        my $Ensembl_id = $1;
        if ( !exists $gene{$Ensembl_id} ) {
            $gene{$Ensembl_id} = 1;
            if (m/gene_biotype:(\w+)/) {
                my $gene_biotype = $1;

                # The symbol is optional: when the pattern does not match,
                # the literal text "NULL" is stored in its place.
                my $gene_symbol;
                if (m/gene_symbol:(\w+\D\d+)/) {
                    $gene_symbol = $1;
                }
                if (! defined $gene_symbol) {
                    $gene_symbol = "NULL";
                }

                # The description is everything up to the first "[".
                if (m/description:([^\[]*)/) {
                    my $gene_description = $1;
                    if (m/MGI:(\d+)/) {
                        my $MGI_accession = $1;
                        $sth->execute( $Ensembl_id, $gene_biotype, $gene_symbol, $gene_description, $MGI_accession );
                        next LOOP;
                    }
                }
            }
        }
    }
}
close FILE;
$dbh->disconnect ();
我想将 FASTA 文件中的信息插入 MySQL 数据库的一张表中,并使用 Ensembl_id 列作为主键。
其中一些 Ensembl_id 不是唯一的,所以我尝试使用 exists 运算符来解决这个问题。但是表中只插入了 5 行,其中只有 1 行的 Ensembl_id 值是重复出现的。
#!/usr/bin/perl -w
# Reads the gzip-compressed mouse cDNA FASTA file record by record and inserts
# one row per Ensembl gene ID into the table named on the command line. A hash
# of already-seen IDs is used to skip records whose ID was met before.
#usage script.pl <username> <password> <database_name> <mouse_genes> <mouse_transcripts>
use DBI;
use Data::Dumper;

# Positional command-line arguments; each one must be present and true.
my $user = shift @ARGV or die $!;
my $password = shift @ARGV or die $!;
my $database = shift @ARGV or die $!;
my $mouse_genes = shift @ARGV or die $!;
my $mouse_transcripts = shift @ARGV or die $!;

# RaiseError makes every failing DBI call die instead of returning an error.
my $dbh = DBI->connect( "dbi:mysql:$database:localhost", "$user", "$password",
    { RaiseError => 1 } );

# Ensembl IDs that have already been encountered.
my %gene;

# Split the input on "\n>" so each read returns one whole FASTA record.
$/ = "\n>";
open( FILE, "gzip -d -c /data.dash/class2016/student/Mus_musculus.GRCm38.cdna.all.fa.gz |" )
    or die $!;

LOOP:
while ( <FILE> ) {
    my $line = $_;
    chomp $line;
    if ( $line =~ /[a-z]/ ) {
        my @array = split( "\t", $line );
        if ( m/gene:(\w+\d+\.\w+)/ ) {
            my $Ensembl_id = $1;

            # NOTE(review): the ID is marked as seen before the remaining
            # fields are matched, so if this record fails one of the patterns
            # below, later records with the same ID are skipped as well.
            if ( !exists $gene{$Ensembl_id} ) {
                $gene{$Ensembl_id} = 1;
            }
            else {
                next;
            }
            if ( m/gene_biotype:(\w+)/ ) {
                my $gene_biotype = $1;
                if ( m/gene_symbol:(\w+\D\d+)/ ) {
                    my $gene_symbol = $1;
                    if ( m/description:(\w+\s+\w+\s+\w+\s+)/ ) {
                        my $gene_description = $1;
                        if ( m/MGI:(\d+)/ ) {
                            my $MGI_accession = $1;
                            my $sth = $dbh->prepare(
                                qq{insert into $mouse_genes (Ensembl_id,gene_biotype,gene_symbol,gene_description,MGI_accession) values ("$Ensembl_id","$gene_biotype","$gene_symbol","$gene_description","$MGI_accession")}
                            );
                            $sth->execute();
                            $sth->finish();
                            next LOOP;
                        }
                    }
                }
            }
        }
    }
}
close FILE;
$dbh->disconnect();
如果主键 $Ensembl_id 重复,我该如何使用 exists 运算符跳到文件的下一行?
我以为我看到了一个和这个很相似的问题,但是我找不到了
解决方案是放弃哈希,改用 IGNORE 关键字来避免引发错误。MySQL 文档中写道:
If you use the IGNORE keyword, errors that occur while executing the INSERT statement are ignored. For example, without IGNORE, a row that duplicates an existing UNIQUE index or PRIMARY KEY value in the table causes a duplicate-key error and the statement is aborted. With IGNORE, the row is discarded and no error occurs. Ignored errors may generate warnings instead, although duplicate-key errors do not.
您还应该在 SQL 语句中使用占位符,因此它应该如下所示。
注意结束标记 END_SQL 必须独占一行,且前后都不能有空白字符。您可能希望在程序顶部定义 SQL 语句,以免破坏代码的缩进。
# Build the statement text first, then hand it to prepare(). The five "?"
# placeholders are filled in by execute(), in the order the columns are listed.
my $insert_sql = <<END_SQL;
INSERT IGNORE INTO $mouse_genes (
Ensembl_id,
gene_biotype,
gene_symbol,
gene_description,
MGI_accession
)
VALUES ( ?, ?, ?, ?, ? )
END_SQL
my $sth = $dbh->prepare($insert_sql);

# One bind value per placeholder.
my @row_values = (
    $Ensembl_id,
    $gene_biotype,
    $gene_symbol,
    $gene_description,
    $MGI_accession,
);
$sth->execute(@row_values);
更新
您的程序可以整理很多,使其更易于阅读。我会这样写
#!/usr/bin/perl
use strict;
use warnings 'all';

# Reads the gzip-compressed mouse cDNA FASTA file record by record and inserts
# one row per record into the table named on the command line. INSERT IGNORE
# is used so that a duplicate Ensembl_id primary key does not raise an error.
#
# usage script.pl <username> <password> <database_name> <mouse_genes> <mouse_transcripts>

use DBI;

# All five positional arguments are required. Report how to call the script
# rather than printing an unrelated $! value when one is missing.
die "usage: $0 <username> <password> <database_name> <mouse_genes> <mouse_transcripts>\n"
    if @ARGV < 5;
my ( $user, $password, $database, $mouse_genes, $mouse_transcripts )
    = splice @ARGV, 0, 5;    # $mouse_transcripts is not used at present

# RaiseError makes every failing DBI call die; PrintError is switched off so
# the same message is not also emitted as a warning.
my $dbh = DBI->connect( "dbi:mysql:$database:localhost", $user, $password,
    { RaiseError => 1, PrintError => 0 } );

# Prepared once, executed once per record. The heredoc terminator must stand
# alone on its line with no surrounding whitespace.
my $sth = $dbh->prepare( <<END_SQL );
INSERT IGNORE INTO $mouse_genes (
Ensembl_id,
gene_biotype,
gene_symbol,
gene_description,
MGI_accession
)
VALUES ( ?, ?, ?, ?, ? )
END_SQL

# List-form pipe open: gzip is run directly with these arguments.
my @cmd = ( 'gzip', '-d', '-c',
    '/data.dash/class2016/student/Mus_musculus.GRCm38.cdna.all.fa.gz' );
open my $cmd_fh, '-|', @cmd or die "Cannot run '@cmd': $!\n";

{
    # Split the input on "\n>" so each read returns one whole FASTA record;
    # localised so the change does not outlive this block.
    local $/ = "\n>";

    while ( <$cmd_fh> ) {

        # A record is skipped as soon as any one of the five fields is absent.
        next unless my ( $ensembl_id ) = /gene:(\w+\d+\.\w+)/;
        next unless my ( $gene_biotype ) = /gene_biotype:(\w+)/;
        next unless my ( $gene_symbol ) = /gene_symbol:(\w+\D\d+)/;
        next unless my ( $gene_description ) = /description:(\w+\s+\w+\s+\w+)\s/;
        next unless my ( $mgi_accession ) = /MGI:(\d+)/;

        $sth->execute( $ensembl_id, $gene_biotype, $gene_symbol, $gene_description, $mgi_accession );
    }
}

# Closing a pipe waits for the child process; a false result with $! unset
# means gzip itself exited with a failure status (available in $?).
if ( !close $cmd_fh ) {
    die $!
        ? "Error closing pipe from '@cmd': $!\n"
        : "'@cmd' failed (wait status $?)\n";
}

$dbh->disconnect;
我想出了如何使用哈希来避免重复的键:
#!/usr/bin/perl -w
#this script inserts sequences from Mus_musculus.GRCm38.cdna.all.fa.gz into mouse_genes table
#usage lab5_2.pl <username> <password> <database_name> <mouse_genes> <mouse_transcripts>
#
# A hash of already-seen Ensembl IDs ensures that only the first record for
# each ID is considered, so the primary key is never inserted twice.
use DBI;
use Data::Dumper;

# Positional command-line arguments; each one must be present and true.
my $user = shift @ARGV or die $!;
my $password = shift @ARGV or die $!;
my $database = shift @ARGV or die $!;
my $mouse_genes = shift @ARGV or die $!;

# RaiseError makes every failing DBI call die instead of returning an error.
my $dbh = DBI->connect("dbi:mysql:$database:localhost",
    "$user",
    "$password",
    {RaiseError => 1}
);

# Prepared once and reused for every row. The values are passed as bind
# parameters, so quotes inside the free-text description cannot break the
# statement. The table name comes from the <mouse_genes> argument.
my $sth = $dbh->prepare(
    qq{insert into $mouse_genes (Ensembl_id, gene_biotype, gene_symbol, gene_description, MGI_accession) values (?, ?, ?, ?, ?)}
);

# Ensembl IDs that have already been encountered.
my %gene;

# Split the input on "\n>" so each read returns one whole FASTA record.
$/ = "\n>";
open (FILE, "gzip -d -c /data.dash/class2016/student/Mus_musculus.GRCm38.cdna.all.fa.gz |") or die $!;

LOOP: while (<FILE>) {
    if (m/gene:(\w+\d+\.\d+)/) {
        my $Ensembl_id = $1;
        if ( !exists $gene{$Ensembl_id} ) {
            $gene{$Ensembl_id} = 1;
            if (m/gene_biotype:(\w+)/) {
                my $gene_biotype = $1;

                # The symbol is optional: when the pattern does not match,
                # the literal text "NULL" is stored in its place.
                my $gene_symbol;
                if (m/gene_symbol:(\w+\D\d+)/) {
                    $gene_symbol = $1;
                }
                if (! defined $gene_symbol) {
                    $gene_symbol = "NULL";
                }

                # The description is everything up to the first "[".
                if (m/description:([^\[]*)/) {
                    my $gene_description = $1;
                    if (m/MGI:(\d+)/) {
                        my $MGI_accession = $1;
                        $sth->execute( $Ensembl_id, $gene_biotype, $gene_symbol, $gene_description, $MGI_accession );
                        next LOOP;
                    }
                }
            }
        }
    }
}
close FILE;
$dbh->disconnect ();