SPARQL 查询格式
SPARQL query formation
我有 RDF 数据,我想形成一个 SPARQL 查询来获取与特定生物名称匹配的记录。
仅供参考:我使用 RDF4J,根据现有的 JSON-LD 数据生成 RDF 记录。
我在获取匹配任何特定 PropertyValue 集的记录时遇到问题。示例:生物体为 Equus caballus 的所有记录或提交标识符为 GSB-7331 的所有记录。
非常感谢任何帮助。
数据记录如下:
@prefix schema: <http://schema.org/> .
@prefix obo: <http://purl.obolibrary.org/obo/> .
@prefix ebi-bsd: <https://www.ebi.ac.uk/biosamples/> .
@prefix biosamples: <http://identifiers.org/biosample/> .
biosamples:SAMEA104496657 a schema:DataRecord ;
schema:dateCreated "0002-10-15T00:00:00Z"^^schema:Date ;
schema:dateModified "2019-07-23T18:33:14.867Z"^^schema:Date ;
schema:identifier "SAMEA104496657" ;
schema:isPartOf ebi-bsd:samples ;
schema:mainEntity _:b0 .
ebi-bsd:samples a schema:Dataset .
_:b0 a schema:Sample , obo:OBI_0000747 ;
schema:additionalProperty _:b1 , _:b2 , _:b3 , _:b4 ;
schema:description "Blood samples N123" ;
schema:identifier "SAMEA104496657" ;
schema:name "N123" ;
schema:sameAs biosamples:SAMEA104496657 .
_:b1 a schema:PropertyValue ;
schema:name "organism" ;
schema:value "Equus caballus" ;
schema:valueReference obo:NCBITaxon_9796 .
obo:NCBITaxon_9796 a schema:DefinedTerm .
_:b2 a schema:PropertyValue ;
schema:name "submission description" ;
schema:value "ELOAD_294_samples" .
_:b3 a schema:PropertyValue ;
schema:name "submission identifier" ;
schema:value "GSB-7331" .
_:b4 a schema:PropertyValue ;
schema:name "submission title" ;
schema:value "ELOAD_294" .
@prefix schema: <http://schema.org/> .
@prefix obo: <http://purl.obolibrary.org/obo/> .
@prefix ebi-bsd: <https://www.ebi.ac.uk/biosamples/> .
@prefix biosamples: <http://identifiers.org/biosample/> .
biosamples:SAMEA104625758 a schema:DataRecord ;
schema:dateCreated "0014-06-07T00:00:00Z"^^schema:Date ;
schema:dateModified "2019-08-06T17:46:01.812Z"^^schema:Date ;
schema:identifier "SAMEA104625758" ;
schema:isPartOf ebi-bsd:samples ;
schema:mainEntity _:b0 .
ebi-bsd:samples a schema:Dataset .
_:b0 a schema:Sample , obo:OBI_0000747 ;
schema:additionalProperty _:b1 , _:b2 , _:b3 ;
schema:description "Colorectal Cancer Tumor Sequenced Samaple" ;
schema:identifier "SAMEA104625758" ;
schema:name "P-0009062-T01-IM5" ;
schema:sameAs biosamples:SAMEA104625758 ;
schema:subjectOf "http://www.ebi.ac.uk/ena/data/view/SAMEA104625758" .
_:b1 a schema:PropertyValue ;
schema:name "common name" ;
schema:value "Human" ;
schema:valueReference obo:NCBITaxon_9606 .
obo:NCBITaxon_9606 a schema:DefinedTerm .
_:b2 a schema:PropertyValue ;
schema:name "organism" ;
schema:value "Homo sapiens" ;
schema:valueReference obo:NCBITaxon_9606 .
_:b3 a schema:PropertyValue ;
schema:name "scientific name" ;
schema:value "Homo sapiens" ;
schema:valueReference obo:NCBITaxon_9606 .
我用来生成 RDF Turtle 数据的代码如下。
示例数据是 JSON-LD 格式,下载自 https://www.ebi.ac.uk/biosamples/samples/SAMN03177689.ldjson :
import org.apache.commons.io.FileUtils;
import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.RDFHandlerException;
import org.eclipse.rdf4j.rio.RDFParser;
import org.eclipse.rdf4j.rio.Rio;
import org.eclipse.rdf4j.rio.helpers.StatementCollector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.InputStream;
import java.io.StringWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.Scanner;
import java.util.concurrent.Callable;
/**
 * Task that downloads one JSON-LD document over HTTP, converts it to Turtle and
 * appends the result to a shared output file.
 *
 * <p>The output file is static state set through {@link #setFilePath(String)} and must be
 * configured before a task runs.
 *
 * <p>NOTE(review): nothing in this class coordinates appends to the shared file or updates
 * of the sample counter between threads; confirm how tasks are scheduled.
 */
public class BioSchemasRdfGenerator implements Callable<Void> {
    private static final Logger log = LoggerFactory.getLogger(BioSchemasRdfGenerator.class);
    private static File file;
    private static long sampleCount = 0;
    private final URL url;

    /**
     * Sets the file that every task appends its Turtle output to.
     *
     * @param filePath path of the output file
     */
    public static void setFilePath(String filePath) {
        file = new File(filePath);
    }

    /**
     * Creates a task for one document and logs the running count of created tasks.
     *
     * @param url location of the JSON-LD document to convert
     */
    BioSchemasRdfGenerator(final URL url) {
        log.info("HANDLING {} and the current sample count is: {}", url, ++sampleCount);
        this.url = url;
    }

    /**
     * Downloads, converts and stores the document this task was created for.
     *
     * @return always {@code null}
     * @throws Exception if the connection cannot be opened; later failures surface as
     *     {@link RuntimeException}
     */
    @Override
    public Void call() throws Exception {
        requestHTTPAndHandle(this.url);
        return null;
    }

    /**
     * Issues a GET request and, on HTTP 200, converts and stores the response body.
     * Any other status is logged and skipped.
     *
     * @param url document to fetch
     * @throws Exception if the connection cannot be opened
     */
    private static void requestHTTPAndHandle(final URL url) throws Exception {
        final HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        try {
            conn.setRequestMethod("GET");
            conn.connect();
            final int response = conn.getResponseCode();
            if (response == HttpURLConnection.HTTP_OK) {
                // Read the body from the connection that is already open instead of
                // issuing a second request for the same URL.
                try (InputStream body = conn.getInputStream()) {
                    handleSuccessResponses(url, body);
                }
            } else {
                log.warn("Skipping {}: unexpected HTTP status {}", url, response);
            }
        } catch (final RuntimeException e) {
            throw e;
        } catch (final Exception e) {
            throw new RuntimeException(e);
        } finally {
            conn.disconnect();
        }
    }

    /**
     * Converts a successful response body to Turtle and appends it to the output file.
     *
     * @param url  origin of the data, used only for the error message
     * @param body the JSON-LD response body; handed to the parser as raw bytes so no
     *             platform-charset decoding takes place
     */
    private static void handleSuccessResponses(final URL url, final InputStream body) {
        try {
            write(readRdfToString(body));
        } catch (final RuntimeException e) {
            throw e;
        } catch (final Exception e) {
            throw new RuntimeException("Failed to convert or store " + url, e);
        }
    }

    /**
     * Appends the given text to the configured output file as UTF-8.
     *
     * @param sampleData text to append
     * @throws IllegalStateException if {@link #setFilePath(String)} has not been called
     * @throws Exception if writing fails
     */
    private static void write(final String sampleData) throws Exception {
        if (file == null) {
            throw new IllegalStateException("Output file not configured; call setFilePath(String) first");
        }
        FileUtils.writeStringToFile(file, sampleData, StandardCharsets.UTF_8, true);
    }

    /**
     * @param in a rdf input stream
     * @return a string representation
     */
    private static String readRdfToString(final InputStream in) {
        return graphToString(readRdfToGraph(in));
    }

    /**
     * Parses JSON-LD from the stream, using an empty base URI.
     *
     * @param inputStream an Input stream containing rdf data
     * @return a Graph representing the rdf in the input stream
     */
    private static Collection<Statement> readRdfToGraph(final InputStream inputStream) {
        try {
            final RDFParser rdfParser = Rio.createParser(RDFFormat.JSONLD);
            final StatementCollector collector = new StatementCollector();
            rdfParser.setRDFHandler(collector);
            rdfParser.parse(inputStream, "");
            return collector.getStatements();
        } catch (final Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Transforms a graph to a string.
     *
     * @param myGraph a sesame rdf graph
     * @return a rdf string
     */
    private static String graphToString(final Collection<Statement> myGraph) {
        final StringWriter out = new StringWriter();
        final TurtleWriterCustom turtleWriterCustom = new TurtleWriterCustom(out);
        return modifyIdentifier(writeRdfInTurtleFormat(myGraph, out, turtleWriterCustom));
    }

    /**
     * Removes every occurrence of the literal text {@code biosample:} from the serialized RDF.
     *
     * <p>NOTE(review): this is a plain text substitution over the whole document, so it applies
     * to any place that text appears, not only to identifier values; confirm that is intended.
     *
     * @param rdfString serialized RDF, may be {@code null}
     * @return the text with the substring removed, or {@code null} if the input was {@code null}
     */
    private static String modifyIdentifier(String rdfString) {
        if (rdfString != null) {
            // Literal replacement; no regular expression is needed for a fixed string.
            rdfString = rdfString.replace("biosample:", "");
        }
        return rdfString;
    }

    /**
     * Streams the statements through the writer and returns what was written to {@code out}.
     *
     * @param myGraph statements to serialize
     * @param out     buffer the writer was constructed with
     * @param writer  writer that receives the namespaces and statements
     * @return the content of {@code out} after the writer has finished
     */
    private static String writeRdfInTurtleFormat(Collection<Statement> myGraph, StringWriter out, TurtleWriterCustom writer) {
        try {
            writer.startRDF();
            handleNamespaces(writer);
            for (Statement st : myGraph) {
                writer.handleStatement(st);
            }
            writer.endRDF();
        } catch (final RDFHandlerException e) {
            throw new RuntimeException(e);
        }
        return out.getBuffer().toString();
    }

    /**
     * Registers the fixed set of namespace prefixes on the writer.
     *
     * @param writer writer to configure
     */
    private static void handleNamespaces(final TurtleWriterCustom writer) {
        writer.handleNamespace("schema", "http://schema.org/");
        writer.handleNamespace("obo", "http://purl.obolibrary.org/obo/");
        writer.handleNamespace("ebi-bsd", "https://www.ebi.ac.uk/biosamples/");
        writer.handleNamespace("biosamples", "http://identifiers.org/biosample/");
    }
}
您的代码看起来比实际需要的要复杂得多。要使用 RDF4J 将远程 URL 上的 JSON-LD 文件加载为 RDF 模型,您可以简单地执行以下操作:
String file = "https://www.ebi.ac.uk/biosamples/samples/SAMN03177689.ldjson";
// Declared before the try block so the parsed model stays in scope after the stream is closed.
Model m;
try (InputStream input = new URL(file).openStream()) {
    // The document URL is also passed as the base URI argument.
    m = Rio.parse(input, file, RDFFormat.JSONLD);
}
如果你想用 Turtle 语法编写这个模型,你所要做的就是:
// Serializes the model as Turtle to standard output.
// Replace System.out with your own OutputStream if you want to write to a file.
Rio.write(m, System.out, RDFFormat.TURTLE);
如果我对您的示例文件运行这段代码,会得到:
@prefix biosample: <http://identifiers.org/biosample/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix schema: <http://schema.org/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
biosample:SAMN03177689 a schema:DataRecord;
schema:dateCreated "2014-12-12T06:54:48.957Z"^^schema:Date;
schema:dateModified "2019-03-13T09:41:33.81Z"^^schema:Date;
schema:identifier "biosample:SAMN03177689";
schema:isPartOf <https://www.ebi.ac.uk/biosamples/samples>;
schema:mainEntity <https://www.ebi.ac.uk/biosamples/samples/SAMN03177689> .
<https://www.ebi.ac.uk/biosamples/samples> a schema:Dataset .
<https://www.ebi.ac.uk/biosamples/samples/SAMN03177689> a <http://purl.obolibrary.org/obo/OBI_0000747>,
schema:Sample;
schema:additionalProperty _:genid-2e6f9d5c4cc34db8b5ab6e72e7857e31-b0 .
_:genid-2e6f9d5c4cc34db8b5ab6e72e7857e31-b0 a schema:PropertyValue;
schema:name "INSDC center name";
schema:value "FDA" .
[snip]
注意,这里 schema:Sample 的实例使用了一个真正的 IRI 作为标识符,而不是空白节点:<https://www.ebi.ac.uk/biosamples/samples/SAMN03177689>。
您的代码中有几处奇怪的地方。首先是 modifyIdentifier 这个方法。出于某种原因,它会把所有出现的 biosample: 替换为空字符串。我不确定您为什么要这样做(以这种方式操作字符串数据似乎不是个好主意)。而且这种做法会输出无效的 Turtle 语法:如果在上面的示例中把 biosample: 替换为空字符串,您将在第 1 行得到:
@prefix <http://identifiers.org/biosample/> .
这不是一个有效的前缀定义(在 @prefix 之后缺少带冒号的前缀名)。再往下,您会得到:
SAMN03177689 a schema:DataRecord;
这不是有效的 IRI 引用。
然后是 TurtleWriterCustom 这个类。您没有给出该类的代码,但从名称来看,我怀疑它试图对输出做一些进一步的非标准定制,并因此弄乱了您的样本标识符,以某种方式把它们都替换成了(相同的)空白节点。
老实说,我甚至不确定您为什么要把 JSON-LD 转换为 Turtle:如果您的目标是将这些数据加载到 RDF 数据库中以便执行 SPARQL 查询,那么您可以直接加载 JSON-LD 文件:
Repository repo = ...; // your RDF4J database
try (RepositoryConnection conn = repo.getConnection()) {
    // Load the JSON-LD document directly; no intermediate Turtle file is needed.
    conn.add(input, file, RDFFormat.JSONLD);
    // data added to database - you can now query.
    // Selects every DataRecord whose main entity has an additional property
    // named "organism" with the given value.
    String query = "prefix schema: <http://schema.org/> "
        + "select ?r {?r a schema:DataRecord ; "
        + "schema:mainEntity [schema:additionalProperty [schema:name \"organism\" ; schema:value \"Escherichia coli\"] ] }";
    conn.prepareTupleQuery(query).evaluate().forEach(bs -> System.out.println(bs));
}
结果:
[r=http://identifiers.org/biosample/SAMN03177689]
我有 RDF 数据,我想形成一个 SPARQL 查询来获取与特定生物名称匹配的记录。
仅供参考:我使用 RDF4J,根据现有的 JSON-LD 数据生成 RDF 记录。我在获取匹配任何特定 PropertyValue 集的记录时遇到问题。示例:生物体为 Equus caballus 的所有记录或提交标识符为 GSB-7331 的所有记录。
非常感谢任何帮助。
数据记录如下:
@prefix schema: <http://schema.org/> .
@prefix obo: <http://purl.obolibrary.org/obo/> .
@prefix ebi-bsd: <https://www.ebi.ac.uk/biosamples/> .
@prefix biosamples: <http://identifiers.org/biosample/> .
biosamples:SAMEA104496657 a schema:DataRecord ;
schema:dateCreated "0002-10-15T00:00:00Z"^^schema:Date ;
schema:dateModified "2019-07-23T18:33:14.867Z"^^schema:Date ;
schema:identifier "SAMEA104496657" ;
schema:isPartOf ebi-bsd:samples ;
schema:mainEntity _:b0 .
ebi-bsd:samples a schema:Dataset .
_:b0 a schema:Sample , obo:OBI_0000747 ;
schema:additionalProperty _:b1 , _:b2 , _:b3 , _:b4 ;
schema:description "Blood samples N123" ;
schema:identifier "SAMEA104496657" ;
schema:name "N123" ;
schema:sameAs biosamples:SAMEA104496657 .
_:b1 a schema:PropertyValue ;
schema:name "organism" ;
schema:value "Equus caballus" ;
schema:valueReference obo:NCBITaxon_9796 .
obo:NCBITaxon_9796 a schema:DefinedTerm .
_:b2 a schema:PropertyValue ;
schema:name "submission description" ;
schema:value "ELOAD_294_samples" .
_:b3 a schema:PropertyValue ;
schema:name "submission identifier" ;
schema:value "GSB-7331" .
_:b4 a schema:PropertyValue ;
schema:name "submission title" ;
schema:value "ELOAD_294" .
@prefix schema: <http://schema.org/> .
@prefix obo: <http://purl.obolibrary.org/obo/> .
@prefix ebi-bsd: <https://www.ebi.ac.uk/biosamples/> .
@prefix biosamples: <http://identifiers.org/biosample/> .
biosamples:SAMEA104625758 a schema:DataRecord ;
schema:dateCreated "0014-06-07T00:00:00Z"^^schema:Date ;
schema:dateModified "2019-08-06T17:46:01.812Z"^^schema:Date ;
schema:identifier "SAMEA104625758" ;
schema:isPartOf ebi-bsd:samples ;
schema:mainEntity _:b0 .
ebi-bsd:samples a schema:Dataset .
_:b0 a schema:Sample , obo:OBI_0000747 ;
schema:additionalProperty _:b1 , _:b2 , _:b3 ;
schema:description "Colorectal Cancer Tumor Sequenced Samaple" ;
schema:identifier "SAMEA104625758" ;
schema:name "P-0009062-T01-IM5" ;
schema:sameAs biosamples:SAMEA104625758 ;
schema:subjectOf "http://www.ebi.ac.uk/ena/data/view/SAMEA104625758" .
_:b1 a schema:PropertyValue ;
schema:name "common name" ;
schema:value "Human" ;
schema:valueReference obo:NCBITaxon_9606 .
obo:NCBITaxon_9606 a schema:DefinedTerm .
_:b2 a schema:PropertyValue ;
schema:name "organism" ;
schema:value "Homo sapiens" ;
schema:valueReference obo:NCBITaxon_9606 .
_:b3 a schema:PropertyValue ;
schema:name "scientific name" ;
schema:value "Homo sapiens" ;
schema:valueReference obo:NCBITaxon_9606 .
我用来生成 RDF Turtle 数据的代码如下。示例数据是 JSON-LD 格式,下载自 https://www.ebi.ac.uk/biosamples/samples/SAMN03177689.ldjson :
import org.apache.commons.io.FileUtils;
import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.RDFHandlerException;
import org.eclipse.rdf4j.rio.RDFParser;
import org.eclipse.rdf4j.rio.Rio;
import org.eclipse.rdf4j.rio.helpers.StatementCollector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.InputStream;
import java.io.StringWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.Scanner;
import java.util.concurrent.Callable;
/**
 * Task that downloads one JSON-LD document over HTTP, converts it to Turtle and
 * appends the result to a shared output file.
 *
 * <p>The output file is static state set through {@link #setFilePath(String)} and must be
 * configured before a task runs.
 *
 * <p>NOTE(review): nothing in this class coordinates appends to the shared file or updates
 * of the sample counter between threads; confirm how tasks are scheduled.
 */
public class BioSchemasRdfGenerator implements Callable<Void> {
    private static final Logger log = LoggerFactory.getLogger(BioSchemasRdfGenerator.class);
    private static File file;
    private static long sampleCount = 0;
    private final URL url;

    /**
     * Sets the file that every task appends its Turtle output to.
     *
     * @param filePath path of the output file
     */
    public static void setFilePath(String filePath) {
        file = new File(filePath);
    }

    /**
     * Creates a task for one document and logs the running count of created tasks.
     *
     * @param url location of the JSON-LD document to convert
     */
    BioSchemasRdfGenerator(final URL url) {
        log.info("HANDLING {} and the current sample count is: {}", url, ++sampleCount);
        this.url = url;
    }

    /**
     * Downloads, converts and stores the document this task was created for.
     *
     * @return always {@code null}
     * @throws Exception if the connection cannot be opened; later failures surface as
     *     {@link RuntimeException}
     */
    @Override
    public Void call() throws Exception {
        requestHTTPAndHandle(this.url);
        return null;
    }

    /**
     * Issues a GET request and, on HTTP 200, converts and stores the response body.
     * Any other status is logged and skipped.
     *
     * @param url document to fetch
     * @throws Exception if the connection cannot be opened
     */
    private static void requestHTTPAndHandle(final URL url) throws Exception {
        final HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        try {
            conn.setRequestMethod("GET");
            conn.connect();
            final int response = conn.getResponseCode();
            if (response == HttpURLConnection.HTTP_OK) {
                // Read the body from the connection that is already open instead of
                // issuing a second request for the same URL.
                try (InputStream body = conn.getInputStream()) {
                    handleSuccessResponses(url, body);
                }
            } else {
                log.warn("Skipping {}: unexpected HTTP status {}", url, response);
            }
        } catch (final RuntimeException e) {
            throw e;
        } catch (final Exception e) {
            throw new RuntimeException(e);
        } finally {
            conn.disconnect();
        }
    }

    /**
     * Converts a successful response body to Turtle and appends it to the output file.
     *
     * @param url  origin of the data, used only for the error message
     * @param body the JSON-LD response body; handed to the parser as raw bytes so no
     *             platform-charset decoding takes place
     */
    private static void handleSuccessResponses(final URL url, final InputStream body) {
        try {
            write(readRdfToString(body));
        } catch (final RuntimeException e) {
            throw e;
        } catch (final Exception e) {
            throw new RuntimeException("Failed to convert or store " + url, e);
        }
    }

    /**
     * Appends the given text to the configured output file as UTF-8.
     *
     * @param sampleData text to append
     * @throws IllegalStateException if {@link #setFilePath(String)} has not been called
     * @throws Exception if writing fails
     */
    private static void write(final String sampleData) throws Exception {
        if (file == null) {
            throw new IllegalStateException("Output file not configured; call setFilePath(String) first");
        }
        FileUtils.writeStringToFile(file, sampleData, StandardCharsets.UTF_8, true);
    }

    /**
     * @param in a rdf input stream
     * @return a string representation
     */
    private static String readRdfToString(final InputStream in) {
        return graphToString(readRdfToGraph(in));
    }

    /**
     * Parses JSON-LD from the stream, using an empty base URI.
     *
     * @param inputStream an Input stream containing rdf data
     * @return a Graph representing the rdf in the input stream
     */
    private static Collection<Statement> readRdfToGraph(final InputStream inputStream) {
        try {
            final RDFParser rdfParser = Rio.createParser(RDFFormat.JSONLD);
            final StatementCollector collector = new StatementCollector();
            rdfParser.setRDFHandler(collector);
            rdfParser.parse(inputStream, "");
            return collector.getStatements();
        } catch (final Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Transforms a graph to a string.
     *
     * @param myGraph a sesame rdf graph
     * @return a rdf string
     */
    private static String graphToString(final Collection<Statement> myGraph) {
        final StringWriter out = new StringWriter();
        final TurtleWriterCustom turtleWriterCustom = new TurtleWriterCustom(out);
        return modifyIdentifier(writeRdfInTurtleFormat(myGraph, out, turtleWriterCustom));
    }

    /**
     * Removes every occurrence of the literal text {@code biosample:} from the serialized RDF.
     *
     * <p>NOTE(review): this is a plain text substitution over the whole document, so it applies
     * to any place that text appears, not only to identifier values; confirm that is intended.
     *
     * @param rdfString serialized RDF, may be {@code null}
     * @return the text with the substring removed, or {@code null} if the input was {@code null}
     */
    private static String modifyIdentifier(String rdfString) {
        if (rdfString != null) {
            // Literal replacement; no regular expression is needed for a fixed string.
            rdfString = rdfString.replace("biosample:", "");
        }
        return rdfString;
    }

    /**
     * Streams the statements through the writer and returns what was written to {@code out}.
     *
     * @param myGraph statements to serialize
     * @param out     buffer the writer was constructed with
     * @param writer  writer that receives the namespaces and statements
     * @return the content of {@code out} after the writer has finished
     */
    private static String writeRdfInTurtleFormat(Collection<Statement> myGraph, StringWriter out, TurtleWriterCustom writer) {
        try {
            writer.startRDF();
            handleNamespaces(writer);
            for (Statement st : myGraph) {
                writer.handleStatement(st);
            }
            writer.endRDF();
        } catch (final RDFHandlerException e) {
            throw new RuntimeException(e);
        }
        return out.getBuffer().toString();
    }

    /**
     * Registers the fixed set of namespace prefixes on the writer.
     *
     * @param writer writer to configure
     */
    private static void handleNamespaces(final TurtleWriterCustom writer) {
        writer.handleNamespace("schema", "http://schema.org/");
        writer.handleNamespace("obo", "http://purl.obolibrary.org/obo/");
        writer.handleNamespace("ebi-bsd", "https://www.ebi.ac.uk/biosamples/");
        writer.handleNamespace("biosamples", "http://identifiers.org/biosample/");
    }
}
您的代码看起来比实际需要的要复杂得多。要使用 RDF4J 将远程 URL 上的 JSON-LD 文件加载为 RDF 模型,您可以简单地执行以下操作:
String file = "https://www.ebi.ac.uk/biosamples/samples/SAMN03177689.ldjson";
// Declared before the try block so the parsed model stays in scope after the stream is closed.
Model m;
try (InputStream input = new URL(file).openStream()) {
    // The document URL is also passed as the base URI argument.
    m = Rio.parse(input, file, RDFFormat.JSONLD);
}
如果你想用 Turtle 语法编写这个模型,你所要做的就是:
// Serializes the model as Turtle to standard output.
// Replace System.out with your own OutputStream if you want to write to a file.
Rio.write(m, System.out, RDFFormat.TURTLE);
如果我对您的示例文件运行这段代码,会得到:
@prefix biosample: <http://identifiers.org/biosample/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix schema: <http://schema.org/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
biosample:SAMN03177689 a schema:DataRecord;
schema:dateCreated "2014-12-12T06:54:48.957Z"^^schema:Date;
schema:dateModified "2019-03-13T09:41:33.81Z"^^schema:Date;
schema:identifier "biosample:SAMN03177689";
schema:isPartOf <https://www.ebi.ac.uk/biosamples/samples>;
schema:mainEntity <https://www.ebi.ac.uk/biosamples/samples/SAMN03177689> .
<https://www.ebi.ac.uk/biosamples/samples> a schema:Dataset .
<https://www.ebi.ac.uk/biosamples/samples/SAMN03177689> a <http://purl.obolibrary.org/obo/OBI_0000747>,
schema:Sample;
schema:additionalProperty _:genid-2e6f9d5c4cc34db8b5ab6e72e7857e31-b0 .
_:genid-2e6f9d5c4cc34db8b5ab6e72e7857e31-b0 a schema:PropertyValue;
schema:name "INSDC center name";
schema:value "FDA" .
[snip]
注意,这里 schema:Sample 的实例使用了一个真正的 IRI 作为标识符,而不是空白节点:<https://www.ebi.ac.uk/biosamples/samples/SAMN03177689>。
您的代码中有几处奇怪的地方。首先是 modifyIdentifier 这个方法。出于某种原因,它会把所有出现的 biosample: 替换为空字符串。我不确定您为什么要这样做(以这种方式操作字符串数据似乎不是个好主意)。而且这种做法会输出无效的 Turtle 语法:如果在上面的示例中把 biosample: 替换为空字符串,您将在第 1 行得到:
@prefix <http://identifiers.org/biosample/> .
这不是一个有效的前缀定义(在 @prefix 之后缺少带冒号的前缀名)。再往下,您会得到:
SAMN03177689 a schema:DataRecord;
这不是有效的 IRI 引用。
然后是 TurtleWriterCustom 这个类。您没有给出该类的代码,但从名称来看,我怀疑它试图对输出做一些进一步的非标准定制,并因此弄乱了您的样本标识符,以某种方式把它们都替换成了(相同的)空白节点。
老实说,我甚至不确定您为什么要把 JSON-LD 转换为 Turtle:如果您的目标是将这些数据加载到 RDF 数据库中以便执行 SPARQL 查询,那么您可以直接加载 JSON-LD 文件:
Repository repo = ...; // your RDF4J database
try (RepositoryConnection conn = repo.getConnection()) {
    // Load the JSON-LD document directly; no intermediate Turtle file is needed.
    conn.add(input, file, RDFFormat.JSONLD);
    // data added to database - you can now query.
    // Selects every DataRecord whose main entity has an additional property
    // named "organism" with the given value.
    String query = "prefix schema: <http://schema.org/> "
        + "select ?r {?r a schema:DataRecord ; "
        + "schema:mainEntity [schema:additionalProperty [schema:name \"organism\" ; schema:value \"Escherichia coli\"] ] }";
    conn.prepareTupleQuery(query).evaluate().forEach(bs -> System.out.println(bs));
}
结果:
[r=http://identifiers.org/biosample/SAMN03177689]