
Stanford CoreNLP 是一个用于自然语言处理(NLP)的工具库。它是用 Java 写的,但是现在也为 Python 提供了接口。前段时间笔者尝试在 Python 中使用它:


from stanfordcorenlp import StanfordCoreNLP

stanfordcorenlp 中只有 StanfordCoreNLP 一个类

2、获得StanfordCoreNLP 的对象

创建 StanfordCoreNLP 对象时需要传入一个路径参数,该路径指向存放相应 jar 包的文件夹。该文件夹的下载地址:https://stanfordnlp.github.io/CoreNLP/download.html

nlp = StanfordCoreNLP(path)  # path here is the location of the stanford-corenlp-full-2016-10-31 folder



from stanfordcorenlp import StanfordCoreNLP
# Create the wrapper object; `path` is not defined anywhere in this snippet.
nlp = StanfordCoreNLP(path)
# Lower-cased sample sentence whose tokens are already separated by spaces.
sentence = "i 've had the player for about 2 years now and it still performs nicely with the exception of an occasional wwhhhrrr sound from the motor ."
# NOTE(review): the call that produces the dependency list printed further down
# (presumably nlp.dependency_parse(sentence)) is not visible in this snippet — confirm.


$ python WordFormation.py
Traceback (most recent call last):
  File "WordFormation.py", line 1, in <module>
    from stanfordcorenlp import StanfordCoreNLP
ModuleNotFoundError: No module named 'stanfordcorenlp'


PermissionError: [Errno 1] Operation not permitted


$ sudo python WordFormation.py
[('ROOT', 0, 3), ('nsubj', 3, 1), ('aux', 3, 2), ('det', 5, 4), ('dobj', 3, 5), ('case', 9, 6), ('advmod', 8, 7), ('nummod', 9, 8), ('nmod', 5, 9), ('advmod', 3, 10), ('cc', 3, 11), ('nsubj', 14, 12), ('advmod', 14, 13), ('conj', 3, 14), ('advmod', 14, 15), ('case', 18, 16), ('det', 18, 17), ('nmod', 14, 18), ('case', 23, 19), ('det', 23, 20), ('amod', 23, 21), ('compound', 23, 22), ('nmod', 18, 23), ('case', 26, 24), ('det', 26, 25), ('nmod', 23, 26), ('punct', 3, 27)]

其中的数字表示单词在句子中的序号,并且是从1开始计数的;('ROOT', 0, 3) 中的0不对应sentence中的任何单词,而是表示虚拟的根节点

StanfordCoreNLP 还有一些功能,比如词性标注等都可以使用

但是笔者没有在 StanfordCoreNLP 类中找到可以获得更细粒度 dependency 的方法:比如名词性修饰语 nmod,在这里只能得到 nmod,而不能得到带有修饰用介词的 nmod:for 这种形式



java 的话,语句相应会复杂一些

pom.xml 中加入:






import edu.stanford.nlp.ling.CoreAnnotations;
import java.util.Properties;
/**
 * Demo that builds a CoreNLP pipeline and prints the collapsed, CC-processed
 * dependency graph of one sample sentence in LIST format.
 */
public class StanfordEnglishNlpExample {
    public static void main(String[] args) {
        Properties props = new Properties();
        // Configure the pipeline properties
        props.put("annotators", "tokenize,ssplit,pos,parse,depparse");
        props.put("tokenize.options", "ptb3Escaping=false");
        props.put("parse.maxlen", "10000");
        props.put("depparse.extradependencies", "SUBJ_ONLY");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props); // create the StanfordCoreNLP object

        String str = "i 've had the player for about 2 years now and it still performs nicely with the exception of an occasional wwhhhrrr sound from the motor .";
        Annotation document = new Annotation(str);
        // NOTE(review): no call that runs `pipeline` on `document` is visible in this snippet
        // (presumably pipeline.annotate(document) belongs here) — confirm before relying on the lookup below.
        // Take the first sentence of the document.
        CoreMap sentence = document.get(CoreAnnotations.SentencesAnnotation.class).get(0);
        SemanticGraph dependency_graph = sentence.get(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class);  // fetch the dependency graph
        System.out.println("\n\nDependency Graph: " + dependency_graph.toString(SemanticGraph.OutputFormat.LIST));
    }// print the relations directly


Dependency Graph: root(ROOT-0, had-3)
nsubj(had-3, i-1)
aux(had-3, 've-2)
det(player-5, the-4)
dobj(had-3, player-5)
case(years-9, for-6)
advmod(2-8, about-7)
nummod(years-9, 2-8)
nmod:for(player-5, years-9)
advmod(had-3, now-10)
cc(had-3, and-11)
nsubj(performs-14, it-12)
advmod(performs-14, still-13)
conj:and(had-3, performs-14)
advmod(performs-14, nicely-15)
case(exception-18, with-16)
det(exception-18, the-17)
nmod:with(performs-14, exception-18)
case(sound-23, of-19)
det(sound-23, an-20)
amod(sound-23, occasional-21)
compound(sound-23, wwhhhrrr-22)
nmod:of(exception-18, sound-23)
case(motor-26, from-24)
det(motor-26, the-25)
nmod:from(sound-23, motor-26)
punct(had-3, .-27)

在这里面就可以看到nmod:with nmod:of 这样的依存关系了

对于以上代码中的SemanticGraph 对象 dependency_graph 来说

// Fetch the graph's edges as a sorted list.
// NOTE(review): `dependencies` presumably refers to the SemanticGraph that the example above names dependency_graph — confirm.
List<SemanticGraphEdge> list = dependencies.edgeListSorted();

实际上,如果想要 root 关系,只能再从 dependency_graph 中单独获取 root 关系列表,这样一来,root 关系与其他依存关系之间就没有很好的顺序关系了

想要使用词性标注器,首先需要获得 english-left3words-distsim.tagger 文件,这个文件在 stanford-corenlp-full-2016-10-31 中有,可以直接用。但是如果引用的jar包和使用的tagger文件版本不一致,很有可能会导致错误。


URL url = new URL("jar:file:"+ path +
                "!/edu/stanford/nlp/models/pos-tagger/english-left3words/" +
// 这里的path是jar包的路径,"!"后面的是tagger文件在jar包内部的路径
JarURLConnection jarURLConnection = (JarURLConnection) url.openConnection();

词性标注器 MaxentTagger 类的构造器既可以传入路径,也可以传入 InputStream 对象:

// Build the tagger from the input stream of the jar URL connection instead of from a file path.
MaxentTagger tagger = new MaxentTagger(jarURLConnection.getInputStream());


    /**
     * Tags a sample review with MaxentTagger, parses each sentence with DependencyParser and
     * appends every CC-processed typed dependency to a bracketed result string.
     *
     * NOTE(review): this snippet is incomplete — the URL string concatenation ends with a
     * dangling "+", and the closing braces of the loops and of the method are not visible.
     */
    public static void main(String[] args) throws java.net.MalformedURLException, IOException {

        // Build a "jar:file:<path>!/<entry>" URL; `path` is not defined in this snippet.
        URL url = new URL("jar:file:"+ path +
                "!/edu/stanford/nlp/models/pos-tagger/english-left3words/" +
        JarURLConnection jarURLConnection = (JarURLConnection) url.openConnection();

        // Load the tagger from the stream of the jar connection.
        MaxentTagger tagger = new MaxentTagger(jarURLConnection.getInputStream());
        DependencyParser parser = DependencyParser.loadFromModelFile(DependencyParser.DEFAULT_MODEL); // dependency parser

        String review = "i 've had the player for about 2 years now and it still performs nicely with the exception of an occasional wwhhhrrr sound from the motor .";
        // Accumulates the printed dependencies; starts with the opening bracket.
        String result = "[";
        DocumentPreprocessor tockenizer = new DocumentPreprocessor(new StringReader(review)); // split the text into sentences
        for(List<HasWord> sentence: tockenizer){
            List<TaggedWord> tagged = tagger.tagSentence(sentence); // tag the words of the sentence
            GrammaticalStructure gs = parser.predict(tagged);
            List<TypedDependency> tdl = gs.typedDependenciesCCprocessed(); // get the dependencies
            for(TypedDependency td: tdl){
                // Append one "reln(gov, dep)," entry per dependency.
                result = result.concat(td.reln()+"("+td.gov()+", "+td.dep()+"),");


[nsubj(had/VBD, i/FW),aux(had/VBD, 've/VBP),root(ROOT, had/VBD),det(player/NN, the/DT),dobj(had/VBD, player/NN),case(years/NNS, for/IN),advmod(2/CD, about/IN),nummod(years/NNS, 2/CD),nmod:for(player/NN, years/NNS),advmod(had/VBD, now/RB),cc(had/VBD, and/CC),nsubj(performs/VBZ, it/PRP),advmod(performs/VBZ, still/RB),conj:and(had/VBD, performs/VBZ),advmod(performs/VBZ, nicely/RB),case(exception/NN, with/IN),det(exception/NN, the/DT),nmod:with(performs/VBZ, exception/NN),case(sound/NN, of/IN),det(sound/NN, an/DT),amod(sound/NN, occasional/JJ),compound(sound/NN, wwhhhrrr/NN),nmod:of(exception/NN, sound/NN),case(motor/NN, from/IN),det(motor/NN, the/DT),nmod:from(sound/NN, motor/NN),punct(had/VBD, ./.)]


import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

import java.io.IOException;
import java.util.List;
import java.util.Properties;

/**
 * Annotations、Annotators
 * 这两种类是CoreNLP里面的基本架构。
 *      Annotations: 表示一类数据结构,CoreNLP的整个工具包里面输入和输出都是这种数据结构,因此我们自己的文本(一般是String类型)要传给CoreNLP使用,需要先转换成一种Annotations。
 *      Annotators: 是一类功能类,比如我们想分词或者断句等,这每一项功能对应一种annotator类,annotator类接受一种annotation作为输入,然后输出一种annotation。
 * */
public class StanfordEnglishNlpExample {
    // Example that configures a tokenize + ssplit pipeline for a two-sentence sample text and
    // prints every sentence and every token of an annotated document.
    // NOTE(review): this snippet is incomplete — the closing braces of the methods and of the
    // class, and the calls that connect main, runAllAnnotators and parserOutput, are not visible.
    public static void main(String[] args) throws IOException {
        StanfordEnglishNlpExample example = new StanfordEnglishNlpExample();

    public void runAllAnnotators() throws IOException {
        // set up pipeline properties
        Properties props = new Properties();
        // Configure the pipeline properties
        props.setProperty("annotators", "tokenize, ssplit");// alternative: props.put("annotators", "tokenize, ssplit"); tokenize: tokenization; ssplit: sentence splitting [tokenize,ssplit,pos,lemma,ner,parse,depparse,coref,kbp,quote]

        // Create the StanfordCoreNLP object
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        String str = "It's official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in Syria. Obama sent a letter to the heads of the House and Senate on Saturday night, hours after announcing that he believes military action against Syrian targets is the right step to take over the alleged use of chemical weapons.";
        Annotation document = new Annotation(str);

        // run all Annotators on this text
        // NOTE(review): the statement announced by the comment above is not visible in this snippet — confirm.


    public void parserOutput(Annotation document) {
        // these are all the sentences in this document
        // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
        List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
        System.out.println("\nsentences = " + sentences);
        // Take the CoreLabel list out of each CoreMap
        for (CoreMap sentence : sentences) {
            System.out.println("\nsentence = " + sentence);
            // traversing the words in the current sentence: a CoreLabel is a CoreMap with additional token-specific methods
            for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
                // this is the text of the token
                String word = token.get(CoreAnnotations.TextAnnotation.class);
                System.out.println("word = " + word);


sentences = [It's official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in Syria., Obama sent a letter to the heads of the House and Senate on Saturday night, hours after announcing that he believes military action against Syrian targets is the right step to take over the alleged use of chemical weapons.]

sentence = It's official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in Syria.
word = It
word = 's
word = official
word = :
word = U.S.
word = President
word = Barack
word = Obama
word = wants
word = lawmakers
word = to
word = weigh
word = in
word = on
word = whether
word = to
word = use
word = military
word = force
word = in
word = Syria
word = .

sentence = Obama sent a letter to the heads of the House and Senate on Saturday night, hours after announcing that he believes military action against Syrian targets is the right step to take over the alleged use of chemical weapons.
word = Obama
word = sent
word = a
word = letter
word = to
word = the
word = heads
word = of
word = the
word = House
word = and
word = Senate
word = on
word = Saturday
word = night
word = ,
word = hours
word = after
word = announcing
word = that
word = he
word = believes
word = military
word = action
word = against
word = Syrian
word = targets
word = is
word = the
word = right
word = step
word = to
word = take
word = over
word = the
word = alleged
word = use
word = of
word = chemical
word = weapons
word = .

Process finished with exit code 0


import edu.stanford.nlp.pipeline.StanfordCoreNLP;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Properties;

/**
 * Annotations、Annotators
 * 这两种类是CoreNLP里面的基本架构。
 *      Annotations: 表示一类数据结构,CoreNLP的整个工具包里面输入和输出都是这种数据结构,因此我们自己的文本(一般是String类型)要传给CoreNLP使用,需要先转换成一种Annotations。
 *      Annotators: 是一类功能类,比如我们想分词或者断句等,这每一项功能对应一种annotator类,annotator类接受一种annotation作为输入,然后输出一种annotation。
 * */
public class StanfordEnglishNlpFileList {
    // Example that lists the files of a raw-stories directory into a temporary mapping file and
    // configures a tokenize + ssplit pipeline that reads that file list and writes JSON output.
    // NOTE(review): this snippet is incomplete — the bodies of the file loop and of the cleanup
    // branch, the call that runs the pipeline, and the closing braces are not visible.
    public static void main(String[] args) throws IOException {
        StanfordEnglishNlpFileList example = new StanfordEnglishNlpFileList();
        // Save the list of files to process into a temporary file
        String raw_stories_dir = "D:\\NLP_Data\\cnn_stories_dir";
        String tokenized_stories_dir = "D:\\NLP_Data\\cnn_tokenized_stories_dir";

        File file_dir = new File(raw_stories_dir);
        File[] fileList = file_dir.listFiles();
        // Write the absolute path of every file read from raw_stories_dir into mapping_for_corenlp.txt
        String rawFileList = "mapping_for_corenlp.txt";
        // The second argument (true) opens the writer in append mode.
        FileWriter writer = new FileWriter(rawFileList, true);
        for (File file : fileList) {
            if (file.isFile()) {

        // Run the pipeline
        example.runPipeLine(rawFileList, tokenized_stories_dir);

        // Delete the temporary file
        File file = new File(rawFileList);
        if (file.exists()) {

    public void runPipeLine(String rawFileList, String tokenized_stories_dir) throws IOException {
        // set up pipeline properties
        Properties props = new Properties();
        // Configure the pipeline properties
        props.setProperty("annotators", "tokenize, ssplit");// alternative: props.put("annotators", "tokenize, ssplit"); tokenize: tokenization; ssplit: sentence splitting [tokenize,ssplit,pos,lemma,ner,parse,depparse,coref,kbp,quote]
        props.setProperty("filelist", rawFileList);
        props.setProperty("outputFormat", "json");
        props.setProperty("ssplit.newlineIsSentenceBreak", "always");
        props.setProperty("outputDirectory", tokenized_stories_dir);
        // Create the StanfordCoreNLP object
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        // Run the pipeline in batch mode
        // NOTE(review): the statement announced by the comment above is not visible in this snippet — confirm.


三、在Python中调用Stanford CoreNLP的Java语言的Jar包进行pipeline流处理

import jpype  # =0.7.0
import jpype.imports
import os.path

def python2java(mapping_for_corenlp, tokenized_stories_dir):
    """Run the CoreNLP tokenize/ssplit pipeline in batch mode through JPype.

    Starts a JVM with the CoreNLP jar on the class path (unless a JVM is
    already running in this process), builds a ``StanfordCoreNLP`` pipeline
    from a ``java.util.Properties`` object and runs it.

    Args:
        mapping_for_corenlp: Value of the CoreNLP ``filelist`` property;
            presumably the path of a text file that lists the input files,
            one per line (verify against the CoreNLP documentation).
        tokenized_stories_dir: Value of the CoreNLP ``outputDirectory``
            property, i.e. where the JSON output is written.
    """
    # Locate the JVM shared library of the default Java installation,
    # e.g. C:\Program Files\Java\jre1.8.0_40\bin\server\jvm.dll
    jvmPath = jpype.getDefaultJVMPath()
    # Path of the CoreNLP jar that must be on the Java class path
    jarpath = r'D:\NLPUtiles\stanford-corenlp-4.2.2\stanford-corenlp-4.2.2.jar'
    # startJVM raises if a JVM is already running, so only start it once;
    # this lets the function be called repeatedly in the same process.
    if not jpype.isJVMStarted():
        jpype.startJVM(jvmPath, "-ea", "-Djava.class.path=%s" % jarpath)
    # Create the Properties object
    Properties = jpype.JClass('java.util.Properties')
    props = Properties()
    # Configure the pipeline: tokenize = tokenization, ssplit = sentence
    # splitting. Annotator names listed in the original note: tokenize,
    # ssplit, pos, lemma, ner, parse, depparse, coref, kbp, quote.
    props.setProperty("annotators", "tokenize, ssplit")
    props.setProperty("filelist", mapping_for_corenlp)
    props.setProperty("outputFormat", "json")
    props.setProperty("ssplit.newlineIsSentenceBreak", "always")
    props.setProperty("outputDirectory", tokenized_stories_dir)
    jpype.java.lang.System.out.println("props" + props.toString())
    # Create the StanfordCoreNLP object
    StanfordCoreNLP = jpype.JClass('edu.stanford.nlp.pipeline.StanfordCoreNLP')
    pipeline = StanfordCoreNLP(props)
    # Run the pipeline in batch mode. The original built the pipeline but
    # never invoked it, so no output was produced.
    pipeline.run()

if __name__ == '__main__':
    # Script entry point: hand the file-list name and the output directory
    # straight to python2java.
    python2java(
        mapping_for_corenlp="mapping_for_corenlp.txt",
        tokenized_stories_dir=r"D:\NLP_Data\cnn_tokenized_stories_dir",
    )

