Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.

...

Code Block
language: xml
title: ClearByMimeMetadataFilter
<?xml version="1.0" encoding="UTF-8"?>
<properties>
  <metadataFilters>
    <metadataFilter class="org.apache.tika.metadata.filter.ClearByMimeMetadataFilter">
      <params>
        <!-- this will remove metadata objects for jpegs and pdfs; more seriously,
             this may be useful for image files or emf or wmf depending on your use case -->
        <mimes>
          <mime>image/jpeg</mime>
          <mime>application/pdf</mime>
        </mimes>
      </params>
    </metadataFilter>
  </metadataFilters>
</properties>

...

Code Block
language: xml
title: StandardWriteFilter
<?xml version="1.0" encoding="UTF-8"?>
<properties>
  <autoDetectParserConfig>
    <metadataWriteFilterFactory class="org.apache.tika.metadata.writefilter.StandardWriteFilterFactory">
      <params>
        <!-- Every measurement below is in UTF-16 bytes. Whenever a value is truncated,
             TikaCoreProperties.TRUNCATED_METADATA is set to true in the metadata object. -->

        <!-- Upper bound on the size of a single metadata key. -->
        <maxKeySize>1000</maxKeySize>

        <!-- Upper bound, in UTF-16 bytes, on the total size of one field. When a field
             has multiple values, their lengths are summed to obtain the field size. -->
        <maxFieldSize>10000</maxFieldSize>

        <!-- Upper bound on the total estimated bytes, i.e. the sum of the key sizes
             and the values. -->
        <maxTotalEstimatedBytes>100000</maxTotalEstimatedBytes>

        <!-- Cap on the number of values kept for a multi-valued field. -->
        <maxValuesPerField>100</maxValuesPerField>

        <!-- Include only these fields. Note, however, that there are several fields
             that are important to the parse process, and these fields are always
             allowed in addition (see ALWAYS_SET_FIELDS and ALWAYS_ADD_FIELDS in the
             StandardWriteFilter). -->
        <includeFields>
          <field>dc:creator</field>
          <field>dc:title</field>
        </includeFields>
      </params>
    </metadataWriteFilterFactory>
  </autoDetectParserConfig>
</properties>

...

Code Block
language: xml
title: AutoDetectParserConfig
<?xml version="1.0" encoding="UTF-8"?>
<properties>
  <autoDetectParserConfig>
    <params>
      <!-- if the incoming metadata object has a ContentLength entry and it is larger than this
           value, spool the file to disk; this is useful for some file formats that are more
           efficiently processed via a file instead of an InputStream -->
      <spoolToDisk>100000</spoolToDisk>

      <!-- the next four are parameters for the SecureContentHandler -->

      <!-- threshold used in zip bomb detection. This many characters must be written
           before the maximum compression ratio is calculated -->
      <outputThreshold>10000</outputThreshold>

      <!-- maximum compression ratio between output characters and input bytes -->
      <maximumCompressionRatio>100</maximumCompressionRatio>

      <!-- maximum XML element nesting level -->
      <maximumDepth>100</maximumDepth>

      <!-- maximum embedded file depth -->
      <maximumPackageEntryDepth>100</maximumPackageEntryDepth>
    </params>
  </autoDetectParserConfig>
</properties>


TODO: add an example of the EmbeddedDocumentExtractorFactory

TODO: add a fifth (?) section on write limiting