// src/edu/stanford/nlp/pipeline/CoreNLP.proto

// This schema is written in proto2 syntax, so fields carry explicit
// `required` / `optional` / `repeated` labels.
syntax = "proto2";

// Protobuf package that scopes every type declared in this file.
package edu.stanford.nlp.pipeline;

// Generated Java code goes into this Java package, with all types nested
// inside a single outer class named CoreNLPProtos.
option java_package = "edu.stanford.nlp.pipeline";
option java_outer_classname = "CoreNLPProtos";

//
// From JAVANLP_HOME, you can build me with the command:
//
//  protoc -I=src/edu/stanford/nlp/pipeline/ --java_out=src src/edu/stanford/nlp/pipeline/CoreNLP.proto
//

//
// To do the python version:
//
//  protoc -I=./doc --python_out=./stanza/protobuf ./doc/CoreNLP.proto
//

// The set of languages CoreNLP accepts.
//
// Unknown is the zero value; DependencyGraph.Edge.language names it as its
// explicit default. The numeric values are part of the wire format and must
// stay fixed.
enum Language {
  Unknown = 0;
  Any = 1;
  Arabic = 2;
  Chinese = 3;
  English = 4;
  German = 5;
  French = 6;
  Hebrew = 7;
  Spanish = 8;
  UniversalEnglish = 9;
  UniversalChinese = 10;
}

//
// A document; that is, the equivalent of an Annotation.
//
// Only `text` is required; a proto2 message that lacks a required field is
// uninitialized and is rejected by the normal build/parse paths. Field
// numbers below are not in declaration order: they are the wire identifiers,
// so they must never be changed or reused.
//
message Document {
  required string     text        = 1;
  repeated Sentence   sentence    = 2;
  repeated CorefChain corefChain  = 3;
  optional string     docID       = 4;
  optional string     docDate     = 7;
  optional uint64     calendar    = 8;

  /**
   * A peculiar field, for the corner case when a Document is
   * serialized without any sentences. Otherwise the tokens are presumably
   * carried by each Sentence's own `token` field (the original comment was
   * cut off at "Otherwise" -- TODO confirm).
   */
  repeated Token      sentencelessToken = 5;
  // NOTE(review): not documented in the original; Sentence declares a field
  // with the same name and type.
  repeated Token      character = 10;

  repeated Quote      quote = 6;

  /**
   * This field is for entity mentions across the document.
   */
  repeated NERMention mentions = 9;
  optional bool hasEntityMentionsAnnotation = 13; // used to differentiate between null and empty list

  /**
   * xml information
   */
  optional bool    xmlDoc = 11;
  repeated Section sections = 12;

  /** coref mentions for entire document **/
  repeated Mention         mentionsForCoref                    = 14;
  optional bool hasCorefMentionAnnotation = 15;
  optional bool hasCorefAnnotation = 16;
  // NOTE(review): the two lists below are presumably index-to-index lookup
  // tables between coref mentions and entity mentions -- confirm the
  // encoding (and any "no mapping" sentinel) against the serializer.
  repeated int32 corefMentionToEntityMentionMappings = 17;
  repeated int32 entityMentionToCorefMentionMappings = 18;

  // Field numbers 100-255 are set aside for extensions of this message.
  extensions 100 to 255;
}

//
// The serialized version of a CoreMap representing a sentence.
//
// tokenOffsetBegin and tokenOffsetEnd are required; everything else is
// optional or repeated. Field numbers are not in declaration order: they are
// the wire identifiers, so they must never be changed or reused.
//
message Sentence {
  repeated Token            token                               = 1;
  required uint32           tokenOffsetBegin                    = 2;
  required uint32           tokenOffsetEnd                      = 3;
  optional uint32           sentenceIndex                       = 4;
  optional uint32           characterOffsetBegin                = 5;
  optional uint32           characterOffsetEnd                  = 6;
  optional ParseTree        parseTree                           = 7;
  optional ParseTree        binarizedParseTree                  = 31;
  optional ParseTree        annotatedParseTree                  = 32;
  // Stored as a free-form string here, not as the Sentiment enum that
  // ParseTree.sentiment uses.
  optional string           sentiment                           = 33;
  repeated ParseTree        kBestParseTrees                     = 34;
  optional DependencyGraph  basicDependencies                   = 8;
  optional DependencyGraph  collapsedDependencies               = 9;
  optional DependencyGraph  collapsedCCProcessedDependencies    = 10;
  optional DependencyGraph  alternativeDependencies             = 13;
  repeated RelationTriple   openieTriple                        = 14;   // The OpenIE triples in the sentence
  repeated RelationTriple   kbpTriple                           = 16;   // The KBP triples in this sentence
  repeated SentenceFragment entailedSentence                    = 15;   // The entailed sentences, by natural logic
  repeated SentenceFragment entailedClause                      = 35;   // The entailed clauses, by natural logic
  optional DependencyGraph  enhancedDependencies                = 17;
  optional DependencyGraph  enhancedPlusPlusDependencies        = 18;
  repeated Token            character                           = 19;

  optional uint32           paragraph                           = 11;

  optional string           text                                = 12;   // Only needed if we're only saving the sentence.

  optional uint32           lineNumber                          = 20;

  // Fields set by other annotators in CoreNLP
  optional bool            hasRelationAnnotations              = 51;
  repeated Entity          entity                              = 52;
  repeated Relation        relation                            = 53;
  optional bool            hasNumerizedTokensAnnotation        = 54;
  repeated NERMention      mentions                            = 55;
  repeated Mention         mentionsForCoref                    = 56;
  optional bool            hasCorefMentionsAnnotation          = 57;

  optional string          sentenceID                          = 58;  // Useful when storing sentences (e.g. ForEach)
  optional string          sectionDate                         = 59;  // date of section
  optional uint32          sectionIndex                        = 60;  // section index for this sentence's section
  optional string          sectionName                         = 61;  // name of section
  optional string          sectionAuthor                       = 62;  // author of section
  optional string          docID                               = 63;  // doc id
  optional bool            sectionQuoted                       = 64;  // is this sentence in an xml quote in a post

  optional bool            hasEntityMentionsAnnotation         = 65;  // check if there are entity mentions
  optional bool            hasKBPTriplesAnnotation             = 68;  // check if there are KBP triples
  optional bool            hasOpenieTriplesAnnotation          = 69;  // check if there are OpenIE triples

  // quote stuff
  optional uint32             chapterIndex                     = 66;
  optional uint32             paragraphIndex                   = 67;
  // the quote annotator can sometimes add merged sentences
  // (this field nests another Sentence inside this one)
  optional Sentence           enhancedSentence                 = 70;

  // speaker stuff
  optional string          speaker                             = 71;  // The speaker speaking this sentence
  optional string          speakerType                         = 72;  // The type of speaker speaking this sentence

  // Field numbers 100-255 are set aside for extensions of this message.
  extensions 100 to 255;
}

//
// The serialized version of a Token (a CoreLabel).
//
// Every field is optional or repeated. Field numbers identify fields on the
// wire, so they must never be changed, and the number of a deleted field
// must not be handed to a new field.
//
message Token {
  // Fields set by the default annotators [new CoreNLP(new Properties())]
  optional string word              = 1;    // the word's gloss (post-tokenization)
  optional string pos               = 2;    // The word's part of speech tag
  optional string value             = 3;    // The word's 'value', (e.g., parse tree node)
  optional string category          = 4;    // The word's 'category' (e.g., parse tree node)
  optional string before            = 5;    // The whitespace/xml before the token
  optional string after             = 6;    // The whitespace/xml after the token
  optional string originalText      = 7;    // The original text for this token
  optional string ner               = 8;    // The word's NER tag
  optional string coarseNER         = 62;   // The word's coarse NER tag
  optional string fineGrainedNER    = 63;   // The word's fine-grained NER tag
  repeated string nerLabelProbs     = 66;   // listing of probs
  optional string normalizedNER     = 9;    // The word's normalized NER tag
  optional string lemma             = 10;   // The word's lemma
  optional uint32 beginChar         = 11;   // The character offset begin, in the document
  optional uint32 endChar           = 12;   // The character offset end, in the document
  optional uint32 utterance         = 13;   // The utterance tag used in dcoref
  optional string speaker           = 14;   // The speaker speaking this word
  optional string speakerType       = 77;   // The type of speaker speaking this word
  optional uint32 beginIndex        = 15;   // The begin index of, e.g., a span
  optional uint32 endIndex          = 16;   // The end index of, e.g., a span
  optional uint32 tokenBeginIndex   = 17;   // The begin index of the token
  optional uint32 tokenEndIndex     = 18;   // The end index of the token
  optional Timex  timexValue        = 19;   // The time this word refers to
  optional bool   hasXmlContext     = 21;   // Used by clean xml annotator
  repeated string xmlContext        = 22;   // Used by clean xml annotator
  optional uint32 corefClusterID    = 23;   // The [primary] cluster id for this token
  optional string answer            = 24;   // A temporary annotation which is occasionally left in

  // Field 25 used to be `optional string projectedCategory` (the syntactic
  // category of the maximal constituent headed by the word). It was not used
  // anywhere, so it was deleted. The number and name are reserved so that
  // protoc rejects any attempt to reuse them with a different meaning.
  // NOTE(review): number 20 is also unassigned in this message; if it was
  // ever in use it should be reserved as well -- confirm against history.
  reserved 25;
  reserved "projectedCategory";

  optional uint32    headWordIndex  = 26;   // The index of the head word of this word.
  optional Operator  operator       = 27;   // If this is an operator, which one is it and what is its scope (as per Natural Logic)?
  optional Polarity  polarity       = 28;   // The polarity of this word, according to Natural Logic
  optional string    polarity_dir   = 39;   // The polarity of this word, either "up", "down", or "flat"
  optional Span      span           = 29;   // The span of a leaf node of a tree
  optional string    sentiment      = 30;   // The final sentiment of the sentence
  optional int32     quotationIndex = 31;   // The index of the quotation this token refers to
  optional MapStringString conllUFeatures = 32;
  optional string coarseTag         = 33; //  The coarse POS tag (used to store the UPOS tag)
  optional Span conllUTokenSpan     = 34;
  optional string conllUMisc        = 35;
  optional MapStringString conllUSecondaryDeps = 36;
  optional string   wikipediaEntity = 37;
  optional bool     isNewline = 38;


  // Fields set by other annotators in CoreNLP
  optional string gender          = 51;  // gender annotation (machine reading)
  optional string trueCase        = 52;  // true case type of token
  optional string trueCaseText    = 53;  // true case gloss of token

  //  Chinese character info
  optional string chineseChar     = 54;
  optional string chineseSeg      = 55;
  optional string chineseXMLChar  = 60;

  //  Arabic character info
  optional string arabicSeg       = 76;

  // Section info
  optional string sectionName     = 56;
  optional string sectionAuthor   = 57;
  optional string sectionDate     = 58;
  optional string sectionEndLabel = 59;

  // French tokens have parents
  optional string parent          = 61;

  // mention index info
  repeated uint32 corefMentionIndex = 64;
  optional uint32 entityMentionIndex = 65;

  // mwt stuff
  optional bool isMWT = 67;
  optional bool isFirstMWT = 68;
  optional string mwtText = 69;

  // number info
  optional uint64 numericValue = 70;
  optional string numericType = 71;
  optional uint64 numericCompositeValue = 72;
  optional string numericCompositeType = 73;

  optional uint32 codepointOffsetBegin   = 74;
  optional uint32 codepointOffsetEnd     = 75;

  // Fields in the CoreLabel java class that are moved elsewhere
  //       string text           @see Document#text + character offsets
  //       uint32 sentenceIndex  @see Sentence#sentenceIndex
  //       string docID          @see Document#docID
  //       uint32 index          @see implicit in Sentence
  //       uint32 paragraph      @see Sentence#paragraph

  // Field numbers 100-255 are set aside for extensions of this message.
  extensions 100 to 255;
}

// Valid sentiment values for the sentiment classifier, declared from the
// most negative (0) to the most positive (4).
//
// ParseTree.sentiment uses this enum; Sentence.sentiment and Token.sentiment
// are plain strings instead.
enum Sentiment {
  STRONG_NEGATIVE = 0;
  WEAK_NEGATIVE = 1;
  NEUTRAL = 2;
  WEAK_POSITIVE = 3;
  STRONG_POSITIVE = 4;
}

// A quotation marker in text.
//
// Every field is optional. Field number 4 is not assigned in this message.
message Quote {
  optional string text = 1;

  // Extent of the quote.
  // NOTE(review): begin/end are presumably character offsets, with the
  // sentence* and token* pairs giving the same extent in other units --
  // confirm against the quote annotator.
  optional uint32 begin = 2;
  optional uint32 end = 3;
  optional uint32 sentenceBegin = 5;
  optional uint32 sentenceEnd = 6;
  optional uint32 tokenBegin = 7;
  optional uint32 tokenEnd = 8;

  optional string docid = 9;
  optional uint32 index = 10;
  optional string author = 11;

  // Mention attached to the quote.
  optional string mention = 12;
  optional uint32 mentionBegin = 13;
  optional uint32 mentionEnd = 14;
  optional string mentionType = 15;
  optional string mentionSieve = 16;

  // Speaker attached to the quote.
  optional string speaker = 17;
  optional string speakerSieve = 18;

  // Canonical mention attached to the quote.
  optional string canonicalMention = 19;
  optional uint32 canonicalMentionBegin = 20;
  optional uint32 canonicalMentionEnd = 21;

  optional DependencyGraph attributionDependencyGraph = 22;
}

// A syntactic parse tree, with scores.
//
// The message is recursive: every node carries its children as nested
// ParseTree messages, so message nesting depth grows with tree depth.
// FlattenedParseTree is the non-nested alternative.
message ParseTree {
  repeated ParseTree child = 1;
  optional string value = 2;
  optional uint32 yieldBeginIndex = 3;
  optional uint32 yieldEndIndex = 4;
  optional double score = 5;
  optional Sentiment sentiment = 6;
}

// A dependency graph representation: a list of nodes, a list of edges, and
// a list of roots.
message DependencyGraph {
  // One vertex of the graph. sentenceIndex and index are required.
  message Node {
    required uint32 sentenceIndex = 1;
    required uint32 index = 2;
    optional uint32 copyAnnotation = 3;
  }

  // One edge of the graph. source and target are required.
  // NOTE(review): source/target presumably hold Node.index values, and
  // sourceCopy/targetCopy presumably pair with Node.copyAnnotation --
  // confirm against the serializer.
  message Edge {
    required uint32 source = 1;
    required uint32 target = 2;
    optional string dep = 3;
    optional bool isExtra = 4;
    optional uint32 sourceCopy = 5;
    optional uint32 targetCopy = 6;
    // Reads back as Unknown when the writer did not set it.
    optional Language language = 7 [default = Unknown];
  }

  repeated Node node = 1;
  repeated Edge edge = 2;
  // Uses the packed encoding (explicitly requested; proto2 does not pack
  // repeated scalars by default).
  repeated uint32 root = 3 [packed = true];
}

// A coreference chain.
//
// chainID and representative are required by the schema. The fields that
// are declared optional are not *really* optional either: CoreNLP will crash
// without them.
message CorefChain {
  // One mention belonging to the chain. Field number 8 is not assigned.
  message CorefMention {
    optional int32 mentionID = 1;
    optional string mentionType = 2;
    optional string number = 3;
    optional string gender = 4;
    optional string animacy = 5;
    optional uint32 beginIndex = 6;
    optional uint32 endIndex = 7;
    optional uint32 headIndex = 9;
    optional uint32 sentenceIndex = 10;
    // The second element of position.
    optional uint32 position = 11;
  }

  required int32 chainID = 1;
  repeated CorefMention mention = 2;
  required uint32 representative = 3;
}

//
// A mention, as used for coreference (Document.mentionsForCoref and
// Sentence.mentionsForCoref hold lists of these).
//
// Every field is optional or repeated. Field number 8 is not assigned, and
// numbers 34-49 are unused between the singular and repeated groups.
//

message Mention {
  optional int32 mentionID             = 1;
  optional string mentionType          = 2;
  optional string number               = 3;
  optional string gender               = 4;
  optional string animacy              = 5;
  optional string person               = 6;
  optional uint32 startIndex           = 7;
  optional uint32 endIndex             = 9;
  optional int32 headIndex             = 10;
  optional string headString           = 11;
  optional string nerString            = 12;
  optional int32 originalRef           = 13;
  optional int32 goldCorefClusterID    = 14;
  optional int32 corefClusterID        = 15;
  optional int32 mentionNum            = 16;
  optional int32 sentNum               = 17;
  optional int32 utter                 = 18;
  optional int32 paragraph             = 19;
  optional bool isSubject              = 20;
  optional bool isDirectObject         = 21;
  optional bool isIndirectObject       = 22;
  optional bool isPrepositionObject    = 23;
  optional bool hasTwin                = 24;
  optional bool generic                = 25;
  optional bool isSingleton            = 26;
  optional bool hasBasicDependency     = 27;
  // The name is misspelled ("Depenedncy"). It is kept as-is because renaming
  // a field changes the generated accessors and the text/JSON field name.
  optional bool hasEnhancedDepenedncy  = 28;
  optional bool hasContextParseTree    = 29;
  optional IndexedWord headIndexedWord = 30;
  optional IndexedWord   dependingVerb = 31;
  optional IndexedWord       headWord  = 32;
  optional SpeakerInfo    speakerInfo  = 33;

  repeated IndexedWord         sentenceWords = 50;
  repeated IndexedWord         originalSpan = 51;
  repeated string dependents           = 52;
  repeated string preprocessedTerms    = 53;
  // NOTE(review): the int32 lists below presumably hold mentionID values of
  // related mentions -- confirm against the serializer.
  repeated int32 appositions = 54;
  repeated int32 predicateNominatives = 55;
  repeated int32 relativePronouns = 56;
  repeated int32 listMembers = 57;
  repeated int32 belongToLists = 58;

}

// Stores the position (sentence, token index) of a CoreLabel.
//
// Note that docID is an int32 here, whereas Document.docID and
// Sentence.docID are strings.
message IndexedWord {
  optional int32 sentenceNum = 1;
  optional int32 tokenIndex = 2;
  optional int32 docID = 3;
  optional uint32 copyCount = 4;
}

// Speaker info; this is used for Mentions (see Mention.speakerInfo).
message SpeakerInfo {
  optional string speakerName = 1;
  // NOTE(review): presumably the mentionID values of this speaker's
  // mentions -- confirm against the serializer.
  repeated int32 mentions = 2;
}

// A Span of text. Both bounds are required.
// NOTE(review): whether `end` is inclusive or exclusive is not stated in
// this file -- confirm with the code that fills it in.
message Span {
  required uint32 begin = 1;
  required uint32 end = 2;
}

// A Timex object, representing a temporal expression (TIMe EXpression).
//
// All fields are declared optional, but they are not *really* optional:
// CoreNLP will crash without them.
message Timex {
  optional string value = 1;
  optional string altValue = 2;
  optional string text = 3;
  optional string type = 4;
  optional string tid = 5;
  optional uint32 beginPoint = 6;
  optional uint32 endPoint = 7;
}

// A representation of an entity in a relation.
// This corresponds to the EntityMention, and more broadly the
// ExtractionObject classes.
//
// The sentence the entity belongs to is not stored here; it is implicit in
// the enclosing sentence.
message Entity {
  // Fields specific to the entity (numbers 6-11).
  optional uint32 headStart = 6;
  optional uint32 headEnd = 7;
  optional string mentionType = 8;
  optional string normalizedName = 9;
  optional uint32 headTokenIndex = 10;
  optional string corefID = 11;

  // Fields inherited from ExtractionObject (numbers 1-5).
  optional string objectID = 1;
  optional uint32 extentStart = 2;
  optional uint32 extentEnd = 3;
  optional string type = 4;
  optional string subtype = 5;
}

// A representation of a relation, mirroring RelationMention.
//
// The sentence the relation belongs to is not stored here; it is implicit
// in the enclosing sentence.
message Relation {
  // Fields specific to the relation (numbers 6-8).
  // NOTE(review): argName and arg are presumably parallel lists paired by
  // position -- confirm against the serializer.
  repeated string argName = 6;
  repeated Entity arg = 7;
  optional string signature = 8;

  // Fields inherited from ExtractionObject (numbers 1-5).
  optional string objectID = 1;
  optional uint32 extentStart = 2;
  optional uint32 extentEnd = 3;
  optional string type = 4;
  optional string subtype = 5;
}

// A Natural Logic operator.
//
// All seven fields are required: the operator's name plus begin/end bounds
// for its quantifier, subject and object spans. The bounds are signed
// (int32), unlike the uint32 bounds in Span.
message Operator {
  required string name = 1;
  required int32 quantifierSpanBegin = 2;
  required int32 quantifierSpanEnd = 3;
  required int32 subjectSpanBegin = 4;
  required int32 subjectSpanEnd = 5;
  required int32 objectSpanBegin = 6;
  required int32 objectSpanEnd = 7;
}

// The seven informative Natural Logic relations.
// Polarity declares one field of this type per value listed here.
enum NaturalLogicRelation {
  EQUIVALENCE = 0;
  FORWARD_ENTAILMENT = 1;
  REVERSE_ENTAILMENT = 2;
  NEGATION = 3;
  ALTERNATION = 4;
  COVER = 5;
  INDEPENDENCE = 6;
}

// The polarity of a word, according to Natural Logic.
//
// All seven fields are required. There is one `project*` field for each
// NaturalLogicRelation value, declared in the same order as the enum.
// NOTE(review): each field presumably stores the relation that the
// correspondingly named input relation projects to -- confirm.
message Polarity {
  required NaturalLogicRelation projectEquivalence = 1;
  required NaturalLogicRelation projectForwardEntailment = 2;
  required NaturalLogicRelation projectReverseEntailment = 3;
  required NaturalLogicRelation projectNegation = 4;
  required NaturalLogicRelation projectAlternation = 5;
  required NaturalLogicRelation projectCover = 6;
  required NaturalLogicRelation projectIndependence = 7;
}

// An NER mention in the text.
//
// The token range within the sentence (start inclusive, end exclusive, as
// the field names say) and the NER label are required; the rest is optional.
message NERMention {
  optional uint32 sentenceIndex = 1;
  required uint32 tokenStartInSentenceInclusive = 2;
  required uint32 tokenEndInSentenceExclusive = 3;
  required string ner = 4;
  optional string normalizedNER = 5;
  optional string entityType = 6;
  optional Timex timex = 7;
  optional string wikipediaEntity = 8;
  optional string gender = 9;
  optional uint32 entityMentionIndex = 10;
  optional uint32 canonicalEntityMentionIndex = 11;
  optional string entityMentionText = 12;
}

// An entailed sentence fragment.
// Created by the openie annotator; Sentence.entailedSentence and
// Sentence.entailedClause hold lists of these.
message SentenceFragment {
  repeated uint32 tokenIndex = 1;
  optional uint32 root = 2;
  optional bool assumedTruth = 3;
  optional double score = 4;
}


// The index of a token in a document, including the sentence
// index and the offset. RelationTriple uses this type for its
// subject/relation/object token lists.
message TokenLocation {
  optional uint32 sentenceIndex = 1;
  optional uint32 tokenIndex = 2;
}


//
// An OpenIE relation triple.
// Created by the openie annotator.
//
// Every field is declared optional or repeated. The original comment on
// `relation` marked it "(required)", but the schema does not enforce that.
// Field numbers 5-7 are not assigned in this message.
//
message RelationTriple {
  optional string          subject        = 1;   // The surface form of the subject
  optional string          relation       = 2;   // The surface form of the relation (expected to be set; see note above)
  optional string          object         = 3;   // The surface form of the object
  optional double          confidence     = 4;   // The [optional] confidence of the extraction
  repeated TokenLocation   subjectTokens  = 13; // The tokens comprising the subject of the triple
  repeated TokenLocation   relationTokens = 14; // The tokens comprising the relation of the triple
  repeated TokenLocation   objectTokens   = 15; // The tokens comprising the object of the triple
  optional DependencyGraph tree           = 8;   // The dependency graph fragment for this triple
  optional bool            istmod         = 9;   // If true, this expresses an implicit tmod relation
  optional bool            prefixBe       = 10;  // If true, this relation string is missing a 'be' prefix
  optional bool            suffixBe       = 11;  // If true, this relation string is missing a 'be' suffix
  optional bool            suffixOf       = 12;  // If true, this relation string is missing an 'of' suffix
}


// A map from strings to strings, stored as two repeated fields.
// Used, minimally, in the CoNLLU featurizer.
// NOTE(review): key[i] presumably pairs with value[i]; nothing in the
// schema enforces equal lengths -- confirm against the serializer.
message MapStringString {
  repeated string key = 1;
  repeated string value = 2;
}

// A map from integers to strings, stored as two repeated fields.
// Used, minimally, in the CoNLLU featurizer.
// NOTE(review): key[i] presumably pairs with value[i]; nothing in the
// schema enforces equal lengths -- confirm against the serializer.
message MapIntString {
  repeated uint32 key = 1;
  repeated string value = 2;
}

// Stores section info (see Document.sections).
//
// charBegin, charEnd and xmlTag are required; the remaining fields are
// optional or repeated.
message Section {
  required uint32 charBegin = 1;
  required uint32 charEnd = 2;
  optional string author = 3;
  repeated uint32 sentenceIndexes = 4;
  optional string datetime = 5;
  repeated Quote quotes = 6;
  optional uint32 authorCharBegin = 7;
  optional uint32 authorCharEnd = 8;
  required Token xmlTag = 9;
}


// A message for requesting a semgrex.
//
// Each query entry stores the tokens that make up the corresponding graph
// together with the graph itself. Reusing the existing Document or Sentence
// messages was the alternative, but then it would be ambiguous which of
// their dependency objects to use.
message SemgrexRequest {
  // One graph to search, with its tokens. The graph is required.
  message Dependencies {
    repeated Token token = 1;
    required DependencyGraph graph = 2;
  }

  repeated string semgrex = 1;
  repeated Dependencies query = 2;
}

// The response from running a semgrex.
//
// If you pass in M semgrex expressions and N dependency graphs, this
// returns MxN nested results. Each SemgrexResult can match multiple times
// in one graph.
// Nesting: SemgrexResponse.result -> GraphResult.result ->
// SemgrexResult.match -> Match.
message SemgrexResponse {
  // A named node within a match; both fields are required.
  message NamedNode {
    required string name = 1;
    required int32 matchIndex = 2;
  }

  // A named relation within a match; both fields are required.
  message NamedRelation {
    required string name = 1;
    required string reln = 2;
  }

  // A single match; matchIndex is required.
  message Match {
    required int32 matchIndex = 1;
    repeated NamedNode node = 2;
    repeated NamedRelation reln = 3;
  }

  // All matches collected in one SemgrexResult.
  message SemgrexResult {
    repeated Match match = 1;
  }

  // All SemgrexResults collected in one GraphResult.
  message GraphResult {
    repeated SemgrexResult result = 1;
  }

  repeated GraphResult result = 1;
}


// A request to run TokensRegex patterns over a document.
//
// A whole Document can be sent in, but only its Sentences and Tokens
// matter. The document is required.
message TokensRegexRequest {
  required Document doc = 1;
  repeated string pattern = 2;
}

// The result of running TokensRegex patterns.
//
// It is a nested structure: a repeated PatternMatch, one for each pattern;
// each PatternMatch holds a repeated Match, which tells you which sentence
// matched and where.
message TokensRegexResponse {
  // A matched piece of text with its bounds; every field is optional.
  message MatchLocation {
    optional string text = 1;
    optional int32 begin = 2;
    optional int32 end = 3;
  }

  // One match: the sentence it occurred in and the overall match location
  // (both required), plus a location per group.
  message Match {
    required int32 sentence = 1;
    required MatchLocation match = 2;
    repeated MatchLocation group = 3;
  }

  // Every match found for a single pattern.
  message PatternMatch {
    repeated Match match = 1;
  }

  repeated PatternMatch match = 1;
}

// A request that passes in a document with basic dependencies to be
// converted to enhanced dependencies. The document is required.
message DependencyEnhancerRequest {
  required Document document = 1;

  // At most one of these is set; setting one clears the other.
  oneof ref {
    Language language = 2;
    // The expected value of this is a regex which matches relative pronouns.
    string relativePronouns = 3;
  }
}

// A version of ParseTree with a flattened structure, so that deep trees
// don't exceed the protobuf stack depth: the tree is a flat list of Node
// entries rather than nested messages.
message FlattenedParseTree {
  // One entry of the flattened tree.
  // NOTE(review): openNode/closeNode presumably bracket a subtree and value
  // carries a label -- confirm against the (de)serializer.
  message Node {
    // At most one of these is set per entry.
    oneof contents {
      bool openNode = 1;
      bool closeNode = 2;
      string value = 3;
    }

    optional double score = 4;
  }

  repeated Node nodes = 1;
}

// A request for calling the java constituency parser evaluator from
// elsewhere.
message EvaluateParserRequest {
  // One gold tree (required) paired with the predicted tree(s) for it.
  message ParseResult {
    required FlattenedParseTree gold = 1;
    // Repeated so that kbest parses can be sent in, if the parser handles
    // that. No separate score is needed: FlattenedParseTree nodes already
    // include a score field.
    repeated FlattenedParseTree predicted = 2;
  }

  repeated ParseResult treebank = 1;
}

// The reply to an EvaluateParserRequest: a required F1 score and an
// optional F1 for kbest parses.
message EvaluateParserResponse {
  required double f1 = 1;
  optional double kbestF1 = 2;
}


// A request for running Tsurgeon operations on constituency trees.
message TsurgeonRequest {
  // One operation: a required tregex string plus any number of tsurgeon
  // strings.
  message Operation {
    required string tregex = 1;
    repeated string tsurgeon = 2;
  }

  repeated Operation operations = 1;
  repeated FlattenedParseTree trees = 2;
}

// The results of the Tsurgeon operation: the resulting trees, in flattened
// form.
message TsurgeonResponse {
  repeated FlattenedParseTree trees = 1;
}