Neo4j:特殊字符和时滞
Neo4j: Special characters and time lag
目标: 我正在尝试基于公开可用的 DBLP XML 文件 here 创建 DBLP 数据库的 Neo4j 实例。我将数据库建模为二分图,其中作者在一组中,出版物在另一组中。要获得 John Doe 的所有合著者,必须进行以下 Cypher 查询:
MATCH (a:Author)-[:WROTE]->(publication)<-[:WROTE]-(b:Author) WHERE a.name = "John Doe" RETURN不同的 b"
问题 1: 似乎有一个问题部分与特殊字符有关,例如 ë、æ、í 等。当我在浏览器中的地址 http://localhost:7474/browser/,输入查询 "MATCH (a:Author)-[:WROTE]->(p)<-[:WROTE]-(b:Author) WHERE a.name = "Jan Arne Telle" RETURN DISTINCT b",我应该得到 58 个唯一结果(共同作者),但我得到了 79 个结果。例如,合著者 Daniël Paulusma 被分成三个结果:"Dani"、“ë”、"l Paulusma"。但实际上,我还得到了合著者 David Keldsen 作为三个结果:"David Keldsen"、"David" 和 "Keldsen"。所以问题不仅仅与特殊字符有关。
问题 2: 上述查询的结果在 90697 毫秒内返回。
编辑:进行多次此类查询后,结果会在 2000 毫秒到 4000 毫秒内返回。
这是全部代码:
入口点:Application.java:
package std;
import java.io.File;
import org.neo4j.graphdb.GraphDatabaseService;
import org.neo4j.graphdb.Transaction;
import org.neo4j.graphdb.factory.GraphDatabaseFactory;
import org.neo4j.kernel.impl.util.FileUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.CommandLineRunner;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.data.neo4j.config.EnableNeo4jRepositories;
import org.springframework.data.neo4j.config.Neo4jConfiguration;
import org.springframework.data.neo4j.core.GraphDatabase;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.SAXException;
import org.apache.xerces.util.SecurityManager;
@SpringBootApplication
public class Application implements CommandLineRunner {
@Configuration
@EnableNeo4jRepositories(basePackages = "std")
static class ApplicationConfig extends Neo4jConfiguration {
public ApplicationConfig() {
setBasePackage("std");
}
@Bean
GraphDatabaseService graphDatabaseService() {
return new GraphDatabaseFactory().newEmbeddedDatabase("dblp.db");
}
}
@Autowired
PublicationRepository publicationRepository;
@Autowired
GraphDatabase graphDatabase;
public void run(String... args) throws Exception {
Transaction tx = graphDatabase.beginTx();
try {
SAXParserFactory parserFactory = SAXParserFactory.newInstance();
SAXParser parser = parserFactory.newSAXParser();
SecurityManager mgr = new SecurityManager();
mgr.setEntityExpansionLimit(3100000);
parser.setProperty("http://apache.org/xml/properties/security-manager", mgr);
SaxHandler handler = new SaxHandler(publicationRepository, graphDatabase);
handler.setTransaction(tx);
parser.getXMLReader().setFeature("http://xml.org/sax/features/validation", true);
InputStream xmlInput = new FileInputStream("/Users/username/Documents/dblp.xml");
parser.parse(xmlInput, handler);
tx.success();
} catch (SAXException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} catch (ParserConfigurationException e) {
e.printStackTrace();
} finally {
tx.close();
}
}
public static void main(String[] args) throws Exception {
FileUtils.deleteRecursively(new File("dblp.db"));
SpringApplication.run(Application.class, args);
}
}
Author.java:
package std;
import org.springframework.data.neo4j.annotation.GraphId;
import org.springframework.data.neo4j.annotation.Indexed;
import org.springframework.data.neo4j.annotation.NodeEntity;
import org.springframework.data.neo4j.annotation.Query;
import org.springframework.data.neo4j.support.index.IndexType;
@NodeEntity
public class Author {
@GraphId
private Long id;
@Indexed(indexName = "names", unique = true, indexType = IndexType.FULLTEXT)
private String name;
public Author() {
}
public Author(String name) {
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (this.getClass() != obj.getClass())
return false;
Author other = (Author) obj;
if (this.id != null && this.name != null && other.id != null && other.name != null) {
if (this.id.equals(other.id) && this.name.equals(other.name))
return true;
} else {
return true;
}
return false;
}
@Override
public int hashCode() {
return 31 * (this.id == null ? 1 : this.id.hashCode()) + 31 * (this.name == null ? 1 : this.name.hashCode());
}
public Long getId() {
return id;
}
public void setId(Long id) {
this.id = id;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
}
Publication.java:
package std;
import java.io.Serializable;
import java.util.HashSet;
import java.util.Set;
import org.neo4j.graphdb.Direction;
import org.springframework.data.neo4j.annotation.GraphId;
import org.springframework.data.neo4j.annotation.Indexed;
import org.springframework.data.neo4j.annotation.NodeEntity;
import org.springframework.data.neo4j.annotation.RelatedTo;
import org.springframework.data.neo4j.support.index.IndexType;
@NodeEntity
public class Publication implements Serializable {
private static final long serialVersionUID = -6393545300391560520L;
@GraphId
Long nodeId;
private String type = "";
private String key = "";
private String mdate = "";
private String publtype = "";
private String reviewid = "";
private String rating = "";
@RelatedTo(type = "WROTE", direction = Direction.INCOMING)
private Set<Author> authors = new HashSet<Author>();
private String editor = "";
@Indexed(indexType = IndexType.FULLTEXT, indexName = "titles")
private String title = "";
private String booktitle = "";
private String pages = "";
private String year = "";
private String address = "";
private String journal = "";
private String volume = "";
private String number = "";
private String month = "";
private String url = "";
private String ee = "";
private String cdrom = "";
private String cite = "";
private String publisher = "";
private String note = "";
private String crossref = "";
private String isbn = "";
private String series = "";
private String school = "";
private String chapter = "";
public Publication() {
}
public void addAuthor(Author author) {
authors.add(author);
}
public Set<Author> getAuthors() {
return authors;
}
public void setAuthors(Set<Author> authors) {
this.authors = authors;
}
@Override
public String toString() {
return "TYPE: " + type + "\n"
+ "KEY: " + key + "\n"
+ "MDATE: " + mdate + "\n";
}
public Long getNodeId() {
return nodeId;
}
public void setNodeId(Long nodeId) {
this.nodeId = nodeId;
}
public String getKey() {
return key;
}
public void setKey(String key) {
this.key = key;
}
public String getMdate() {
return mdate;
}
public void setMdate(String mdate) {
this.mdate = mdate;
}
public String getPubltype() {
return publtype;
}
public void setPubltype(String publtype) {
this.publtype = publtype;
}
public String getReviewid() {
return reviewid;
}
public void setReviewid(String reviewid) {
this.reviewid = reviewid;
}
public String getRating() {
return rating;
}
public void setRating(String rating) {
this.rating = rating;
}
public String getType() {
return type;
}
public void setType(String type) {
this.type = type;
}
public String getEditor() {
return editor;
}
public void setEditor(String editor) {
this.editor = editor;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getBooktitle() {
return booktitle;
}
public void setBooktitle(String booktitle) {
this.booktitle = booktitle;
}
public String getPages() {
return pages;
}
public void setPages(String pages) {
this.pages = pages;
}
public String getYear() {
return year;
}
public void setYear(String year) {
this.year = year;
}
public String getAddress() {
return address;
}
public void setAddress(String address) {
this.address = address;
}
public String getJournal() {
return journal;
}
public void setJournal(String journal) {
this.journal = journal;
}
public String getVolume() {
return volume;
}
public void setVolume(String volume) {
this.volume = volume;
}
public String getNumber() {
return number;
}
public void setNumber(String number) {
this.number = number;
}
public String getMonth() {
return month;
}
public void setMonth(String month) {
this.month = month;
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url = url;
}
public String getEe() {
return ee;
}
public void setEe(String ee) {
this.ee = ee;
}
public String getCdrom() {
return cdrom;
}
public void setCdrom(String cdrom) {
this.cdrom = cdrom;
}
public String getCite() {
return cite;
}
public void setCite(String cite) {
this.cite = cite;
}
public String getPublisher() {
return publisher;
}
public void setPublisher(String publisher) {
this.publisher = publisher;
}
public String getNote() {
return note;
}
public void setNote(String note) {
this.note = note;
}
public String getCrossref() {
return crossref;
}
public void setCrossref(String crossref) {
this.crossref = crossref;
}
public String getIsbn() {
return isbn;
}
public void setIsbn(String isbn) {
this.isbn = isbn;
}
public String getSeries() {
return series;
}
public void setSeries(String series) {
this.series = series;
}
public String getSchool() {
return school;
}
public void setSchool(String school) {
this.school = school;
}
public String getChapter() {
return chapter;
}
public void setChapter(String chapter) {
this.chapter = chapter;
}
}
PublicationRepository.java:
package std;
import org.springframework.data.neo4j.repository.GraphRepository;
public interface PublicationRepository extends GraphRepository<Publication> {
Publication findByTitle(String title);
}
SaxHandler.java:
package std;
import java.util.ArrayList;
import java.util.List;
import java.util.Stack;
import org.neo4j.graphdb.Transaction;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.neo4j.core.GraphDatabase;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
public class SaxHandler extends DefaultHandler {
private Stack<String> qNameStack = new Stack<String>();
private Stack<Publication> publicationStack = new Stack<Publication>();
private String publicationType = null;
private PublicationRepository publicationRepository = null;
private Publication publication = null;
private Author author = null;
private String currentElement = null;
private String value = null;
private boolean insideTitle = false;
private GraphDatabase graphDatabase;
private Transaction tx = null;
private static int counter = 0;
public List<Publication> getPublications() {
return publications;
}
@Autowired
public SaxHandler(PublicationRepository publicationRepository, GraphDatabase graphDatabase) {
this.publicationRepository = publicationRepository;
this.graphDatabase = graphDatabase;
}
public void setTransaction(Transaction tx) {
this.tx = tx;
}
public void startElement(String uri, String localName, String tagName, Attributes attributes) throws SAXException {
storeTagName(tagName);
createEmptyPublication();
testIfEnteringTitle(tagName);
testIfPublicationTag(tagName);
testOnAttributes(tagName, attributes);
}
public void endElement(String uri, String localName, String tagName) throws SAXException {
testIfLeavingTitle(tagName);
removeNameOfLastVisitedTag();
testIfFinishedCreatingPublication(tagName);
}
public void characters(char ch[], int start, int length) throws SAXException {
storeContentsInCurrentPublication(ch, start, length);
}
/**
* Store the contents of the current tag in the corresponding field
* of the current publication.
*
* @param ch
* @param start
* @param length
*/
private void storeContentsInCurrentPublication(char ch[], int start, int length) {
value = new String(ch,start,length).trim();
if (value.length() == 0)
return;
publication = publicationStack.peek();
currentElement = qNameStack.peek();
if ("author".equals(currentElement)) {
author = new Author();
author.setName(value);
publication.addAuthor(author);
} else if ("editor".equals(currentElement)) {
publication.setEditor(value);
} else if ("title".equals(currentElement)) {
String title = publication.getTitle() + value;
publication.setTitle(title);
} else if ("booktitle".equals(currentElement)) {
publication.setBooktitle(value);
} else if ("pages".equals(currentElement)) {
publication.setPages(value);
} else if ("year".equals(currentElement)) {
publication.setYear(value);
} else if ("address".equals(currentElement)) {
publication.setAddress(value);
} else if ("journal".equals(currentElement)) {
publication.setJournal(value);
} else if ("volume".equals(currentElement)) {
publication.setVolume(value);
} else if ("number".equals(currentElement)) {
publication.setNumber(value);
} else if ("month".equals(currentElement)) {
publication.setMonth(value);
} else if ("url".equals(currentElement)) {
publication.setUrl(value);
} else if ("ee".equals(currentElement)) {
publication.setEe(value);
} else if ("cdrom".equals(currentElement)) {
publication.setCdrom(value);
} else if ("cite".equals(currentElement)) {
publication.setCite(value);
} else if ("publisher".equals(currentElement)) {
publication.setPublisher(value);
} else if ("note".equals(currentElement)) {
publication.setNote(value);
} else if ("crossref".equals(currentElement)) {
publication.setCrossref(value);
} else if ("isbn".equals(currentElement)) {
publication.setIsbn(value);
} else if ("series".equals(currentElement)) {
publication.setSeries(value);
} else if ("school".equals(currentElement)) {
publication.setSchool(value);
} else if ("chapter".equals(currentElement)) {
publication.setChapter(value);
} else if ("i".equals(currentElement) && isInsideTitleOrBooktitle()) {
String title = publication.getTitle() + "<i>" + value + "</i>";
publication.setTitle(title);
} else if ("sup".equals(currentElement) && isInsideTitleOrBooktitle()) {
String title = publication.getTitle() + "<sup>" + value + "</sup>";
publication.setTitle(title);
} else if ("sub".equals(currentElement) && isInsideTitleOrBooktitle()) {
String title = publication.getTitle() + "<sub>" + value + "</sub>";
publication.setTitle(title);
} else if ("tt".equals(currentElement) && isInsideTitleOrBooktitle()) {
String title = publication.getTitle() + "<tt>" + value + "</tt>";
publication.setTitle(title);
} else if ("ref".equals(currentElement) && isInsideTitleOrBooktitle()) {
String title = publication.getTitle() + "<ref>" + value + "</ref>";
publication.setTitle(title);
}
}
/**
* Returns true if and only if the parser is inside
* either a title or booktitle tag.
*
* @return true if and only if the parser is inside
* either a title or booktitle tag.
*/
private boolean isInsideTitleOrBooktitle() {
return insideTitle;
}
/**
* Checks if the parser is finished with one whole
* publication. If so, the publication is stored in
* the database.
*
* @param tagName
*/
private void testIfFinishedCreatingPublication(String tagName) {
if (publicationType.equals(tagName)) {
publicationRepository.save(publicationStack.pop());
if (++counter % 1000 == 0) {
System.out.println("Counter = " + counter);
tx.success();
tx.close();
tx = graphDatabase.beginTx();
}
}
}
/**
* Removes the tag name of the last visited tag
* from the stack.
*/
private void removeNameOfLastVisitedTag() {
qNameStack.pop();
}
/**
* Store the tag name on the stack.
*
* @param tagName
*/
private void storeTagName(String tagName) {
qNameStack.push(tagName);
}
/**
* Create an empty publication to be filled with data.
*/
private void createEmptyPublication() {
publication = new Publication();
}
/**
* Checks if the parser is entering a title or booktitle tag. If so
* is the case, then a boolean flag is set.
*
* @param tagName the name of the current tag
*/
private void testIfLeavingTitle(String tagName) {
if ("title".equals(tagName) || "booktitle".equals(tagName))
insideTitle = false;
}
/**
* Checks if the parser is entering a title or booktitle tag. If so
* is the case, then a boolean flag is set.
*
* @param tagName the name of the current tag
*/
private void testIfEnteringTitle(String tagName) {
if ("title".equals(tagName) || "booktitle".equals(tagName))
insideTitle = true;
}
/**
* Checks if the current tag is one of:
* - article, inproceedings, proceedings, book, incollection, phdthesis, mastersthesis, www
* If the current tag is one of these, then the type of the current publication is set
* to the corresponding value.
*
* @param tagName the name of the current tag.
*/
private void testIfPublicationTag(String tagName) {
if ("article".equals(tagName)) {
publication.setType("article");
} else if ("inproceedings".equals(tagName)) {
publication.setType("inproceedings");
} else if ("proceedings".equals(tagName)) {
publication.setType("proceedings");
} else if ("book".equals(tagName)) {
publication.setType("book");
} else if ("incollection".equals(tagName)) {
publication.setType("incollection");
} else if ("phdthesis".equals(tagName)) {
publication.setType("phdthesis");
} else if ("mastersthesis".equals(tagName)) {
publication.setType("mastersthesis");
} else if ("www".equals(tagName)) {
publication.setType("www");
}
}
/**
* Checks if the tag has any attributes. If so, the existing attribute
* values are stored.
*
* A tag with attributes is one of:
* - article, inproceedings, proceedings, book, incollection, phdthesis, mastersthesis, www
*
* @param tagName the name of the current tag
* @param attributes the attributes of the current tag, if any
*/
private void testOnAttributes(String tagName, Attributes attributes) {
if (attributes.getLength() > 0) {
publicationType = tagName;
if (attributes.getValue("key") != null) {
publication.setKey(attributes.getValue("key"));
}
if (attributes.getValue("mdate") != null) {
publication.setMdate(attributes.getValue("mdate"));
}
if (attributes.getValue("publtype") != null) {
publication.setMdate(attributes.getValue("publtype"));
}
if (attributes.getValue("reviewid") != null) {
publication.setMdate(attributes.getValue("reviewid"));
}
if (attributes.getValue("rating") != null) {
publication.setMdate(attributes.getValue("rating"));
}
publicationStack.push(publication);
}
}
}
pom.xml:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.dblp</groupId>
<artifactId>graphdbcreator</artifactId>
<version>0.1.0</version>
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>1.2.2.RELEASE</version>
</parent>
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter</artifactId>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-context</artifactId>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-tx</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.data</groupId>
<artifactId>spring-data-neo4j</artifactId>
</dependency>
<dependency>
<groupId>org.hibernate</groupId>
<artifactId>hibernate-validator</artifactId>
</dependency>
<dependency>
<groupId>javax.el</groupId>
<artifactId>javax.el-api</artifactId>
<version>2.2.4</version>
</dependency>
<dependency>
<groupId>xerces</groupId>
<artifactId>xercesImpl</artifactId>
<version>2.8.0</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
</plugins>
</build>
<repositories>
<repository>
<id>spring-releases</id>
<name>Spring Releases</name>
<url>https://repo.spring.io/libs-release</url>
</repository>
<repository>
<id>neo4j</id>
<name>Neo4j</name>
<url>http://m2.neo4j.org/</url>
</repository>
</repositories>
</project>
对于问题 1,请尝试使用适合您需要的分析器设置手动索引。有关如何使用自定义分析器的详细信息,请参阅 http://blog.armbruster-it.de/2014/10/deep-dive-on-fulltext-indexing-with-neo4j/。
另一种选择是在应用程序端使用词干逻辑并将词干名称存储在辅助 属性。
第三个选项是在引用同一个人的作者节点之间添加 "SIMILAR" 关系。
关于问题 2:确保为作者的姓名 属性 建立索引:
CREATE INDEX ON :Author(name)
后续调用的查询时间差异很容易用缓存来解释,阅读更多http://neo4j.com/docs/stable/configuration-caches.html
我的 SAX 处理程序似乎有缺陷。例如,给定一个标记 <author>Daniël Paulusma</author>
,解析器会为 "Dani" 调用 characters() 方法一次,为“ë”调用 characters() 的另一次调用,以及对 characters() 的第三次调用"l Paulusma"。我在这里找到了解决此问题的简单方法:SAX parsing and special characters.
目标: 我正在尝试基于公开可用的 DBLP XML 文件 here 创建 DBLP 数据库的 Neo4j 实例。我将数据库建模为二分图,其中作者在一组中,出版物在另一组中。要获得 John Doe 的所有合著者,必须进行以下 Cypher 查询:
MATCH (a:Author)-[:WROTE]->(publication)<-[:WROTE]-(b:Author) WHERE a.name = "John Doe" RETURN不同的 b"
问题 1: 似乎有一个问题部分与特殊字符有关,例如 ë、æ、í 等。当我在浏览器中的地址 http://localhost:7474/browser/,输入查询 "MATCH (a:Author)-[:WROTE]->(p)<-[:WROTE]-(b:Author) WHERE a.name = "Jan Arne Telle" RETURN DISTINCT b",我应该得到 58 个唯一结果(共同作者),但我得到了 79 个结果。例如,合著者 Daniël Paulusma 被分成三个结果:"Dani"、“ë”、"l Paulusma"。但实际上,我还得到了合著者 David Keldsen 作为三个结果:"David Keldsen"、"David" 和 "Keldsen"。所以问题不仅仅与特殊字符有关。
问题 2: 上述查询的结果在 90697 毫秒内返回。
编辑:进行多次此类查询后,结果会在 2000 毫秒到 4000 毫秒内返回。
这是全部代码:
入口点:Application.java:
package std;
import java.io.File;
import org.neo4j.graphdb.GraphDatabaseService;
import org.neo4j.graphdb.Transaction;
import org.neo4j.graphdb.factory.GraphDatabaseFactory;
import org.neo4j.kernel.impl.util.FileUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.CommandLineRunner;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.data.neo4j.config.EnableNeo4jRepositories;
import org.springframework.data.neo4j.config.Neo4jConfiguration;
import org.springframework.data.neo4j.core.GraphDatabase;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.SAXException;
import org.apache.xerces.util.SecurityManager;
@SpringBootApplication
public class Application implements CommandLineRunner {
@Configuration
@EnableNeo4jRepositories(basePackages = "std")
static class ApplicationConfig extends Neo4jConfiguration {
public ApplicationConfig() {
setBasePackage("std");
}
@Bean
GraphDatabaseService graphDatabaseService() {
return new GraphDatabaseFactory().newEmbeddedDatabase("dblp.db");
}
}
@Autowired
PublicationRepository publicationRepository;
@Autowired
GraphDatabase graphDatabase;
public void run(String... args) throws Exception {
Transaction tx = graphDatabase.beginTx();
try {
SAXParserFactory parserFactory = SAXParserFactory.newInstance();
SAXParser parser = parserFactory.newSAXParser();
SecurityManager mgr = new SecurityManager();
mgr.setEntityExpansionLimit(3100000);
parser.setProperty("http://apache.org/xml/properties/security-manager", mgr);
SaxHandler handler = new SaxHandler(publicationRepository, graphDatabase);
handler.setTransaction(tx);
parser.getXMLReader().setFeature("http://xml.org/sax/features/validation", true);
InputStream xmlInput = new FileInputStream("/Users/username/Documents/dblp.xml");
parser.parse(xmlInput, handler);
tx.success();
} catch (SAXException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} catch (ParserConfigurationException e) {
e.printStackTrace();
} finally {
tx.close();
}
}
public static void main(String[] args) throws Exception {
FileUtils.deleteRecursively(new File("dblp.db"));
SpringApplication.run(Application.class, args);
}
}
Author.java:
package std;
import org.springframework.data.neo4j.annotation.GraphId;
import org.springframework.data.neo4j.annotation.Indexed;
import org.springframework.data.neo4j.annotation.NodeEntity;
import org.springframework.data.neo4j.annotation.Query;
import org.springframework.data.neo4j.support.index.IndexType;
@NodeEntity
public class Author {
@GraphId
private Long id;
@Indexed(indexName = "names", unique = true, indexType = IndexType.FULLTEXT)
private String name;
public Author() {
}
public Author(String name) {
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (this.getClass() != obj.getClass())
return false;
Author other = (Author) obj;
if (this.id != null && this.name != null && other.id != null && other.name != null) {
if (this.id.equals(other.id) && this.name.equals(other.name))
return true;
} else {
return true;
}
return false;
}
@Override
public int hashCode() {
return 31 * (this.id == null ? 1 : this.id.hashCode()) + 31 * (this.name == null ? 1 : this.name.hashCode());
}
public Long getId() {
return id;
}
public void setId(Long id) {
this.id = id;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
}
Publication.java:
package std;
import java.io.Serializable;
import java.util.HashSet;
import java.util.Set;
import org.neo4j.graphdb.Direction;
import org.springframework.data.neo4j.annotation.GraphId;
import org.springframework.data.neo4j.annotation.Indexed;
import org.springframework.data.neo4j.annotation.NodeEntity;
import org.springframework.data.neo4j.annotation.RelatedTo;
import org.springframework.data.neo4j.support.index.IndexType;
@NodeEntity
public class Publication implements Serializable {
private static final long serialVersionUID = -6393545300391560520L;
@GraphId
Long nodeId;
private String type = "";
private String key = "";
private String mdate = "";
private String publtype = "";
private String reviewid = "";
private String rating = "";
@RelatedTo(type = "WROTE", direction = Direction.INCOMING)
private Set<Author> authors = new HashSet<Author>();
private String editor = "";
@Indexed(indexType = IndexType.FULLTEXT, indexName = "titles")
private String title = "";
private String booktitle = "";
private String pages = "";
private String year = "";
private String address = "";
private String journal = "";
private String volume = "";
private String number = "";
private String month = "";
private String url = "";
private String ee = "";
private String cdrom = "";
private String cite = "";
private String publisher = "";
private String note = "";
private String crossref = "";
private String isbn = "";
private String series = "";
private String school = "";
private String chapter = "";
public Publication() {
}
public void addAuthor(Author author) {
authors.add(author);
}
public Set<Author> getAuthors() {
return authors;
}
public void setAuthors(Set<Author> authors) {
this.authors = authors;
}
@Override
public String toString() {
return "TYPE: " + type + "\n"
+ "KEY: " + key + "\n"
+ "MDATE: " + mdate + "\n";
}
public Long getNodeId() {
return nodeId;
}
public void setNodeId(Long nodeId) {
this.nodeId = nodeId;
}
public String getKey() {
return key;
}
public void setKey(String key) {
this.key = key;
}
public String getMdate() {
return mdate;
}
public void setMdate(String mdate) {
this.mdate = mdate;
}
public String getPubltype() {
return publtype;
}
public void setPubltype(String publtype) {
this.publtype = publtype;
}
public String getReviewid() {
return reviewid;
}
public void setReviewid(String reviewid) {
this.reviewid = reviewid;
}
public String getRating() {
return rating;
}
public void setRating(String rating) {
this.rating = rating;
}
public String getType() {
return type;
}
public void setType(String type) {
this.type = type;
}
public String getEditor() {
return editor;
}
public void setEditor(String editor) {
this.editor = editor;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getBooktitle() {
return booktitle;
}
public void setBooktitle(String booktitle) {
this.booktitle = booktitle;
}
public String getPages() {
return pages;
}
public void setPages(String pages) {
this.pages = pages;
}
public String getYear() {
return year;
}
public void setYear(String year) {
this.year = year;
}
public String getAddress() {
return address;
}
public void setAddress(String address) {
this.address = address;
}
public String getJournal() {
return journal;
}
public void setJournal(String journal) {
this.journal = journal;
}
public String getVolume() {
return volume;
}
public void setVolume(String volume) {
this.volume = volume;
}
public String getNumber() {
return number;
}
public void setNumber(String number) {
this.number = number;
}
public String getMonth() {
return month;
}
public void setMonth(String month) {
this.month = month;
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url = url;
}
public String getEe() {
return ee;
}
public void setEe(String ee) {
this.ee = ee;
}
public String getCdrom() {
return cdrom;
}
public void setCdrom(String cdrom) {
this.cdrom = cdrom;
}
public String getCite() {
return cite;
}
public void setCite(String cite) {
this.cite = cite;
}
public String getPublisher() {
return publisher;
}
public void setPublisher(String publisher) {
this.publisher = publisher;
}
public String getNote() {
return note;
}
public void setNote(String note) {
this.note = note;
}
public String getCrossref() {
return crossref;
}
public void setCrossref(String crossref) {
this.crossref = crossref;
}
public String getIsbn() {
return isbn;
}
public void setIsbn(String isbn) {
this.isbn = isbn;
}
public String getSeries() {
return series;
}
public void setSeries(String series) {
this.series = series;
}
public String getSchool() {
return school;
}
public void setSchool(String school) {
this.school = school;
}
public String getChapter() {
return chapter;
}
public void setChapter(String chapter) {
this.chapter = chapter;
}
}
PublicationRepository.java:
package std;
import org.springframework.data.neo4j.repository.GraphRepository;
public interface PublicationRepository extends GraphRepository<Publication> {
Publication findByTitle(String title);
}
SaxHandler.java:
package std;
import java.util.ArrayList;
import java.util.List;
import java.util.Stack;
import org.neo4j.graphdb.Transaction;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.neo4j.core.GraphDatabase;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
public class SaxHandler extends DefaultHandler {
private Stack<String> qNameStack = new Stack<String>();
private Stack<Publication> publicationStack = new Stack<Publication>();
private String publicationType = null;
private PublicationRepository publicationRepository = null;
private Publication publication = null;
private Author author = null;
private String currentElement = null;
private String value = null;
private boolean insideTitle = false;
private GraphDatabase graphDatabase;
private Transaction tx = null;
private static int counter = 0;
public List<Publication> getPublications() {
return publications;
}
@Autowired
public SaxHandler(PublicationRepository publicationRepository, GraphDatabase graphDatabase) {
this.publicationRepository = publicationRepository;
this.graphDatabase = graphDatabase;
}
public void setTransaction(Transaction tx) {
this.tx = tx;
}
public void startElement(String uri, String localName, String tagName, Attributes attributes) throws SAXException {
storeTagName(tagName);
createEmptyPublication();
testIfEnteringTitle(tagName);
testIfPublicationTag(tagName);
testOnAttributes(tagName, attributes);
}
public void endElement(String uri, String localName, String tagName) throws SAXException {
testIfLeavingTitle(tagName);
removeNameOfLastVisitedTag();
testIfFinishedCreatingPublication(tagName);
}
public void characters(char ch[], int start, int length) throws SAXException {
storeContentsInCurrentPublication(ch, start, length);
}
/**
* Store the contents of the current tag in the corresponding field
* of the current publication.
*
* @param ch
* @param start
* @param length
*/
private void storeContentsInCurrentPublication(char ch[], int start, int length) {
value = new String(ch,start,length).trim();
if (value.length() == 0)
return;
publication = publicationStack.peek();
currentElement = qNameStack.peek();
if ("author".equals(currentElement)) {
author = new Author();
author.setName(value);
publication.addAuthor(author);
} else if ("editor".equals(currentElement)) {
publication.setEditor(value);
} else if ("title".equals(currentElement)) {
String title = publication.getTitle() + value;
publication.setTitle(title);
} else if ("booktitle".equals(currentElement)) {
publication.setBooktitle(value);
} else if ("pages".equals(currentElement)) {
publication.setPages(value);
} else if ("year".equals(currentElement)) {
publication.setYear(value);
} else if ("address".equals(currentElement)) {
publication.setAddress(value);
} else if ("journal".equals(currentElement)) {
publication.setJournal(value);
} else if ("volume".equals(currentElement)) {
publication.setVolume(value);
} else if ("number".equals(currentElement)) {
publication.setNumber(value);
} else if ("month".equals(currentElement)) {
publication.setMonth(value);
} else if ("url".equals(currentElement)) {
publication.setUrl(value);
} else if ("ee".equals(currentElement)) {
publication.setEe(value);
} else if ("cdrom".equals(currentElement)) {
publication.setCdrom(value);
} else if ("cite".equals(currentElement)) {
publication.setCite(value);
} else if ("publisher".equals(currentElement)) {
publication.setPublisher(value);
} else if ("note".equals(currentElement)) {
publication.setNote(value);
} else if ("crossref".equals(currentElement)) {
publication.setCrossref(value);
} else if ("isbn".equals(currentElement)) {
publication.setIsbn(value);
} else if ("series".equals(currentElement)) {
publication.setSeries(value);
} else if ("school".equals(currentElement)) {
publication.setSchool(value);
} else if ("chapter".equals(currentElement)) {
publication.setChapter(value);
} else if ("i".equals(currentElement) && isInsideTitleOrBooktitle()) {
String title = publication.getTitle() + "<i>" + value + "</i>";
publication.setTitle(title);
} else if ("sup".equals(currentElement) && isInsideTitleOrBooktitle()) {
String title = publication.getTitle() + "<sup>" + value + "</sup>";
publication.setTitle(title);
} else if ("sub".equals(currentElement) && isInsideTitleOrBooktitle()) {
String title = publication.getTitle() + "<sub>" + value + "</sub>";
publication.setTitle(title);
} else if ("tt".equals(currentElement) && isInsideTitleOrBooktitle()) {
String title = publication.getTitle() + "<tt>" + value + "</tt>";
publication.setTitle(title);
} else if ("ref".equals(currentElement) && isInsideTitleOrBooktitle()) {
String title = publication.getTitle() + "<ref>" + value + "</ref>";
publication.setTitle(title);
}
}
/**
* Returns true if and only if the parser is inside
* either a title or booktitle tag.
*
* @return true if and only if the parser is inside
* either a title or booktitle tag.
*/
private boolean isInsideTitleOrBooktitle() {
return insideTitle;
}
/**
* Checks if the parser is finished with one whole
* publication. If so, the publication is stored in
* the database.
*
* @param tagName
*/
private void testIfFinishedCreatingPublication(String tagName) {
if (publicationType.equals(tagName)) {
publicationRepository.save(publicationStack.pop());
if (++counter % 1000 == 0) {
System.out.println("Counter = " + counter);
tx.success();
tx.close();
tx = graphDatabase.beginTx();
}
}
}
/**
* Removes the tag name of the last visited tag
* from the stack.
*/
private void removeNameOfLastVisitedTag() {
qNameStack.pop();
}
/**
* Store the tag name on the stack.
*
* @param tagName
*/
private void storeTagName(String tagName) {
qNameStack.push(tagName);
}
/**
* Create an empty publication to be filled with data.
*/
private void createEmptyPublication() {
publication = new Publication();
}
/**
* Checks if the parser is entering a title or booktitle tag. If so
* is the case, then a boolean flag is set.
*
* @param tagName the name of the current tag
*/
private void testIfLeavingTitle(String tagName) {
if ("title".equals(tagName) || "booktitle".equals(tagName))
insideTitle = false;
}
/**
* Checks if the parser is entering a title or booktitle tag. If so
* is the case, then a boolean flag is set.
*
* @param tagName the name of the current tag
*/
private void testIfEnteringTitle(String tagName) {
if ("title".equals(tagName) || "booktitle".equals(tagName))
insideTitle = true;
}
/**
* Checks if the current tag is one of:
* - article, inproceedings, proceedings, book, incollection, phdthesis, mastersthesis, www
* If the current tag is one of these, then the type of the current publication is set
* to the corresponding value.
*
* @param tagName the name of the current tag.
*/
private void testIfPublicationTag(String tagName) {
if ("article".equals(tagName)) {
publication.setType("article");
} else if ("inproceedings".equals(tagName)) {
publication.setType("inproceedings");
} else if ("proceedings".equals(tagName)) {
publication.setType("proceedings");
} else if ("book".equals(tagName)) {
publication.setType("book");
} else if ("incollection".equals(tagName)) {
publication.setType("incollection");
} else if ("phdthesis".equals(tagName)) {
publication.setType("phdthesis");
} else if ("mastersthesis".equals(tagName)) {
publication.setType("mastersthesis");
} else if ("www".equals(tagName)) {
publication.setType("www");
}
}
/**
* Checks if the tag has any attributes. If so, the existing attribute
* values are stored.
*
* A tag with attributes is one of:
* - article, inproceedings, proceedings, book, incollection, phdthesis, mastersthesis, www
*
* @param tagName the name of the current tag
* @param attributes the attributes of the current tag, if any
*/
private void testOnAttributes(String tagName, Attributes attributes) {
if (attributes.getLength() > 0) {
publicationType = tagName;
if (attributes.getValue("key") != null) {
publication.setKey(attributes.getValue("key"));
}
if (attributes.getValue("mdate") != null) {
publication.setMdate(attributes.getValue("mdate"));
}
if (attributes.getValue("publtype") != null) {
publication.setMdate(attributes.getValue("publtype"));
}
if (attributes.getValue("reviewid") != null) {
publication.setMdate(attributes.getValue("reviewid"));
}
if (attributes.getValue("rating") != null) {
publication.setMdate(attributes.getValue("rating"));
}
publicationStack.push(publication);
}
}
}
pom.xml:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.dblp</groupId>
<artifactId>graphdbcreator</artifactId>
<version>0.1.0</version>
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>1.2.2.RELEASE</version>
</parent>
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter</artifactId>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-context</artifactId>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-tx</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.data</groupId>
<artifactId>spring-data-neo4j</artifactId>
</dependency>
<dependency>
<groupId>org.hibernate</groupId>
<artifactId>hibernate-validator</artifactId>
</dependency>
<dependency>
<groupId>javax.el</groupId>
<artifactId>javax.el-api</artifactId>
<version>2.2.4</version>
</dependency>
<dependency>
<groupId>xerces</groupId>
<artifactId>xercesImpl</artifactId>
<version>2.8.0</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
</plugins>
</build>
<repositories>
<repository>
<id>spring-releases</id>
<name>Spring Releases</name>
<url>https://repo.spring.io/libs-release</url>
</repository>
<repository>
<id>neo4j</id>
<name>Neo4j</name>
<url>http://m2.neo4j.org/</url>
</repository>
</repositories>
</project>
对于问题 1,请尝试使用适合您需要的分析器设置手动索引。有关如何使用自定义分析器的详细信息,请参阅 http://blog.armbruster-it.de/2014/10/deep-dive-on-fulltext-indexing-with-neo4j/。
另一种选择是在应用程序端使用词干逻辑并将词干名称存储在辅助 属性。
第三个选项是在引用同一个人的作者节点之间添加 "SIMILAR" 关系。
关于问题 2:确保为作者的姓名 属性 建立索引:
CREATE INDEX ON :Author(name)
后续调用的查询时间差异很容易用缓存来解释,阅读更多http://neo4j.com/docs/stable/configuration-caches.html
我的 SAX 处理程序似乎有缺陷。例如,给定一个标记 <author>Daniël Paulusma</author>
,解析器会为 "Dani" 调用 characters() 方法一次,为“ë”调用 characters() 的另一次调用,以及对 characters() 的第三次调用"l Paulusma"。我在这里找到了解决此问题的简单方法:SAX parsing and special characters.