Page History

Introduction

Using the amazing IKVM (http://www.ikvm.net/index.html), OpenNLP's Java files can be converted to a .NET assembly (DLL).
This allows you to use the latest releases of OpenNLP from C# (or any other .NET language).
So far the .NET assembly has successfully been used for: Splitting, Tokenising, POS Tagging & Chunking. Full parsing has yet to be fully tested.

Guide

(Don't forget to unblock any downloaded files)

...

Add references to these assemblies in your project & use at will
The OpenNLP manual is at http://incubator.apache.org/opennlp/documentation/manual/opennlp.html

You will need the models for your language, which are currently available here: http://opennlp.sourceforge.net/models-1.5/

...

Code Block

title	sample.cs
borderStyle	solid

// Loads the English sentence-detection model and builds a sentence detector from it.
// A verbatim string (@"...") is used so the backslashes in the Windows path are not treated as escapes.
string modelPath = @"C:\models\"; // Wherever you've stored your downloaded models
java.io.FileInputStream modelInpStream = new java.io.FileInputStream(modelPath + "en-sent.bin");
opennlp.tools.sentdetect.SentenceModel sentenceModel;
try
{
    sentenceModel = new opennlp.tools.sentdetect.SentenceModel(modelInpStream);
}
finally
{
    // The stream is only needed while the model is being constructed; close it so the file handle is released.
    modelInpStream.close();
}
opennlp.tools.sentdetect.SentenceDetectorME SentenceDetectorME = new opennlp.tools.sentdetect.SentenceDetectorME(sentenceModel);

Mostly though it seems to be very straightforward & works well.

Once set up it takes only a few seconds to create an opennlp.dll assembly from the latest releases
and so it is very easy to keep it bang up to date.

...

Code Block

title	EntityExtractor.cs
borderStyle	solid


using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;


namespace NaturalLanguageProcessingCSharp
{
    /// <summary>
    /// Extractor for the entity types available in openNLP.
    /// Copyright 2013, Don Krapohl www.augmentedintel.com
    /// This source is free for unlimited distribution and use
    /// TODO:
    ///     try/catch/exception handling around model loading (errors currently propagate to the caller)
    ///     model training if desired
    ///     Regex or dictionary entity extraction
    /// </summary>
    /// <remarks>
    /// Call syntax:  myList = ExtractEntities(myInText, EntityType.Person);
    /// The sentence, token and name models are loaded from disk on every call.
    /// </remarks>
    public class EntityExtractor
    {
        /// <summary>Directory used by the parameterless constructor.</summary>
        private const string DefaultModelDirectory = "c:\\models";

        private readonly string modelDirectory;      //directory that holds all downloaded model files
        private readonly string sentenceModelPath;   //path to the model for sentence detection
        private readonly string tokenModelPath;      //model path for English tokens

        /// <summary>The kinds of entity for which a name finder model file is known.</summary>
        public enum EntityType
        {
            Date = 0,
            Location,
            Money,
            Organization,
            Person,
            Time
        }

        /// <summary>
        /// Creates an extractor that reads its models from <c>c:\models</c>.
        /// </summary>
        public EntityExtractor()
            : this(DefaultModelDirectory)
        {
        }

        /// <summary>
        /// Creates an extractor that reads its models from the given directory.
        /// </summary>
        /// <param name="modelDirectory">
        /// Directory containing en-sent.bin, en-token.bin and the en-ner-*.bin files
        /// (downloaded from http://opennlp.sourceforge.net/models-1.5/).
        /// </param>
        /// <exception cref="ArgumentException"><paramref name="modelDirectory"/> is null or empty.</exception>
        public EntityExtractor(string modelDirectory)
        {
            if (string.IsNullOrEmpty(modelDirectory))
            {
                throw new ArgumentException("A model directory is required.", "modelDirectory");
            }

            this.modelDirectory = modelDirectory;
            sentenceModelPath = System.IO.Path.Combine(modelDirectory, "en-sent.bin");
            tokenModelPath = System.IO.Path.Combine(modelDirectory, "en-token.bin");
        }

        /// <summary>
        /// Finds all entities of the requested type in the input text.
        /// </summary>
        /// <remarks>
        /// Required steps to detect names are:
        /// 1. Parse the input into sentences
        /// 2. Parse the sentences into tokens
        /// 3. Find the entity in the tokens
        /// The whole input is treated as a single document.
        /// </remarks>
        /// <param name="inputData">Text to search.</param>
        /// <param name="targetType">Kind of entity to look for; selects the name finder model file.</param>
        /// <returns>The text of every entity found, in the order the sentences were processed.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="inputData"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="targetType"/> is not a defined <see cref="EntityType"/>.</exception>
        public List<string> ExtractEntities(string inputData, EntityType targetType)
        {
            if (inputData == null)
            {
                throw new ArgumentNullException("inputData");
            }

            //------------------Preparation -- pick the Name Finder model based upon entity type-----------------
            //kept in a local (not a field) so one call can never see the path chosen by another call
            string nameFinderModelPath = resolveNameFinderModelPath(targetType);

            //----------------- Preparation -- load models into objects-----------------
            opennlp.tools.sentdetect.SentenceDetectorME sentenceParser = prepareSentenceDetector();
            opennlp.tools.namefind.NameFinderME nameFinder = prepareNameFinder(nameFinderModelPath);
            opennlp.tools.tokenize.TokenizerME tokenizer = prepareTokenizer();

            //------------------  Make sentences, then tokens, then get names--------------------------------
            string[] sentences = sentenceParser.sentDetect(inputData);
            List<string> results = new List<string>();

            foreach (string sentence in sentences)
            {
                //"Don Krapohl enjoys warm sunny weather" would tokenize as
                //"Don", "Krapohl", "enjoys", "warm", "sunny", "weather"
                string[] tokens = tokenizer.tokenize(sentence);

                opennlp.tools.util.Span[] foundNames = nameFinder.find(tokens);

                results.AddRange(opennlp.tools.util.Span.spansToStrings(foundNames, tokens));
            }

            //important:  the OpenNLP manual says adaptive data must be cleared after every *document*,
            //not after every sentence; clearing inside the loop would throw away the context the
            //name finder builds up across the sentences of this input.
            nameFinder.clearAdaptiveData();

            return results;
        }

#region private methods
        /// <summary>Maps an entity type to the full path of its name finder model file.</summary>
        /// <exception cref="ArgumentOutOfRangeException">The value is not a defined <see cref="EntityType"/>.</exception>
        private string resolveNameFinderModelPath(EntityType targetType)
        {
            string fileName;
            switch (targetType)
            {
                case EntityType.Date:
                    fileName = "en-ner-date.bin";
                    break;
                case EntityType.Location:
                    fileName = "en-ner-location.bin";
                    break;
                case EntityType.Money:
                    fileName = "en-ner-money.bin";
                    break;
                case EntityType.Organization:
                    fileName = "en-ner-organization.bin";
                    break;
                case EntityType.Person:
                    fileName = "en-ner-person.bin";
                    break;
                case EntityType.Time:
                    fileName = "en-ner-time.bin";
                    break;
                default:
                    //fail loudly instead of falling through with no model path selected
                    throw new ArgumentOutOfRangeException("targetType", targetType, "Unsupported entity type.");
            }

            return System.IO.Path.Combine(modelDirectory, fileName);
        }

        /// <summary>Loads the token model and creates the tokenizer used to break sentences into words (tokens).</summary>
        private opennlp.tools.tokenize.TokenizerME prepareTokenizer()
        {
            java.io.FileInputStream tokenInputStream = new java.io.FileInputStream(tokenModelPath);     //open the token model file
            try
            {
                opennlp.tools.tokenize.TokenizerModel tokenModel = new opennlp.tools.tokenize.TokenizerModel(tokenInputStream); //load the token model
                return new opennlp.tools.tokenize.TokenizerME(tokenModel);  //create the tokenizer
            }
            finally
            {
                tokenInputStream.close();   //release the file handle whether or not loading succeeded
            }
        }

        /// <summary>Loads the sentence model and creates the sentence detector.</summary>
        private opennlp.tools.sentdetect.SentenceDetectorME prepareSentenceDetector()
        {
            java.io.FileInputStream sentModelStream = new java.io.FileInputStream(sentenceModelPath);       //open the sentence model file
            try
            {
                opennlp.tools.sentdetect.SentenceModel sentModel = new opennlp.tools.sentdetect.SentenceModel(sentModelStream); //load the model
                return new opennlp.tools.sentdetect.SentenceDetectorME(sentModel); //create sentence detector
            }
            finally
            {
                sentModelStream.close();    //release the file handle whether or not loading succeeded
            }
        }

        /// <summary>Loads the name finder model at the given path and creates the name finder.</summary>
        /// <param name="nameFinderModelPath">Full path of the en-ner-*.bin model to load.</param>
        private opennlp.tools.namefind.NameFinderME prepareNameFinder(string nameFinderModelPath)
        {
            java.io.FileInputStream modelInputStream = new java.io.FileInputStream(nameFinderModelPath); //open the name model file
            try
            {
                opennlp.tools.namefind.TokenNameFinderModel model = new opennlp.tools.namefind.TokenNameFinderModel(modelInputStream); //load the model
                return new opennlp.tools.namefind.NameFinderME(model);                   //create the namefinder
            }
            finally
            {
                modelInputStream.close();   //release the file handle whether or not loading succeeded
            }
        }
#endregion
    }
}

Notes

Workaround if an invalid format exception occurs when reading en-pos-maxent.bin
The file en-pos-maxent.bin is actually a zip archive.
If you examine the contents of this zip file, it currently has three files (the others seem to only have 2)
manifest.properties, tags.tagdict, & pos.model
Delete the tags.tagdict from the zipfile so that it only contains manifest.properties & pos.model
Note: Don't actually unzip en-pos-maxent.bin; just delete tags.tagdict, so that en-pos-maxent.bin remains a Zip archive containing the remaining 2 files.

Child pages

Versions Compared

Old Version 1

New Version Current

Key

Introduction

Guide

Notes