Stormcrawler: Apache Tika для анализа свойств PDF

Я добавил Tika в качестве зависимости в свою реализацию StormCrawler, и это позволяет краулеру загружать PDF-документы. Однако заголовок, авторы и другие свойства не извлекаются. Я пробовал различные варианты настройки index.md.mapping и добавил соответствующие свойства в ES_IndexInit, но поле содержимого (content) в Kibana (index) для PDF-документов всегда пусто. Для HTML-страниц всё работает. Не могли бы вы подсказать, что я упустил, или указать пример, на который можно посмотреть?


es-crawler.flux:

name: "crawler"

includes:
  - resource: true
    file: "/crawler-default.yaml"
    override: false

  - resource: false
    file: "crawler-conf.yaml"
    override: true

  - resource: false
    file: "es-conf.yaml"
    override: true

spouts:
  - id: "spout"
    className: "com.digitalpebble.stormcrawler.elasticsearch.persistence.AggregationSpout"
    parallelism: 10

bolts:
  - id: "partitioner"
    className: "com.digitalpebble.stormcrawler.bolt.URLPartitionerBolt"
    parallelism: 1
  - id: "fetcher"
    className: "com.digitalpebble.stormcrawler.bolt.FetcherBolt"
    parallelism: 1
  - id: "sitemap"
    className: "com.digitalpebble.stormcrawler.bolt.SiteMapParserBolt"
    parallelism: 1
  - id: "parse"
    className: "com.digitalpebble.stormcrawler.bolt.JSoupParserBolt"
    parallelism: 5
  - id: "index"
    className: "com.digitalpebble.stormcrawler.elasticsearch.bolt.IndexerBolt"
    parallelism: 1
  - id: "status"
    className: "com.digitalpebble.stormcrawler.elasticsearch.persistence.StatusUpdaterBolt"
    parallelism: 1
  - id: "status_metrics"
    className: "com.digitalpebble.stormcrawler.elasticsearch.metrics.StatusMetricsBolt"
    parallelism: 4
  - id: "redirection_bolt"
    className: "com.digitalpebble.stormcrawler.tika.RedirectionBolt"
    parallelism: 1
  - id: "parser_bolt"
    className: "com.digitalpebble.stormcrawler.tika.ParserBolt"
    parallelism: 1

streams:
  - from: "spout"
    to: "partitioner"
    grouping:
      type: SHUFFLE

  - from: "spout"
    to: "status_metrics"
    grouping:
      type: SHUFFLE

  - from: "partitioner"
    to: "fetcher"
    grouping:
      type: FIELDS
      args: ["key"]

  - from: "fetcher"
    to: "sitemap"
    grouping:
      type: LOCAL_OR_SHUFFLE

  - from: "sitemap"
    to: "parse"
    grouping:
      type: LOCAL_OR_SHUFFLE

  - from: "parse"
    to: "index"
    grouping:
      type: LOCAL_OR_SHUFFLE

  - from: "fetcher"
    to: "status"
    grouping:
      type: FIELDS
      args: ["url"]
      streamId: "status"

  - from: "sitemap"
    to: "status"
    grouping:
      type: FIELDS
      args: ["url"]
      streamId: "status"

  - from: "parse"
    to: "status"
    grouping:
      type: FIELDS
      args: ["url"]
      streamId: "status"

  - from: "index"
    to: "status"
    grouping:
      type: FIELDS
      args: ["url"]
      streamId: "status"

  - from: "parse"
    to: "redirection_bolt"
    grouping:
      type: LOCAL_OR_SHUFFLE

  - from: "redirection_bolt"
    to: "parser_bolt"
    grouping:
      type: LOCAL_OR_SHUFFLE

  - from: "redirection_bolt"
    to: "index"
    grouping:
      type: LOCAL_OR_SHUFFLE

  - from: "parser_bolt"
    to: "index"
    grouping:
      type: LOCAL_OR_SHUFFLE

es-injector.flux:

name: "injector"

includes:
  - resource: true
    file: "/crawler-default.yaml"
    override: false

  - resource: false
    file: "crawler-conf.yaml"
    override: true

  - resource: false
    file: "es-conf.yaml"
    override: true

  - resource: false
    file: "injection-conf.yaml"
    override: true

components:
  - id: "scheme"
    className: "com.digitalpebble.stormcrawler.util.StringTabScheme"
    constructorArgs:
      - DISCOVERED

spouts:
  - id: "spout"
    className: "com.digitalpebble.stormcrawler.spout.FileSpout"
    parallelism: 1
    constructorArgs:
      - "."
      - "seed.txt"
      - ref: "scheme"

bolts:
  - id: "status"
    className: "com.digitalpebble.stormcrawler.elasticsearch.persistence.StatusUpdaterBolt"
    parallelism: 1
  - id: "parser_bolt"
    className: "com.digitalpebble.stormcrawler.tika.ParserBolt"
    parallelism: 1

streams:
  - from: "spout"
    to: "status"
    grouping:
      type: FIELDS
      args: ["url"]

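pom.xml:

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">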

<modelVersion>4.0.0</modelVersion>
<groupId>xyz.com</groupId>
<artifactId>search</artifactId>
<version>search1.0</version>
<packaging>jar</packaging>

<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>

<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.2</version>
            <configuration>
                <source>1.8</source>
                <target>1.8</target>
            </configuration>
        </plugin>
        <plugin>
            <groupId>org.codehaus.mojo</groupId>
            <artifactId>exec-maven-plugin</artifactId>
            <version>1.3.2</version>
            <executions>
                <execution>
                    <goals>
                        <goal>exec</goal>
                    </goals>
                </execution>
            </executions>
            <configuration>
                <executable>java</executable>
                <includeProjectDependencies>true</includeProjectDependencies>
                <includePluginDependencies>false</includePluginDependencies>
                <classpathScope>compile</classpathScope>
            </configuration>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <version>1.3.3</version>
            <executions>
                <execution>
                    <phase>package</phase>
                    <goals>
                        <goal>shade</goal>
                    </goals>
                    <configuration>
                        <createDependencyReducedPom>false</createDependencyReducedPom>
                        <transformers>
                            <transformer
                                implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
                            <transformer
                                implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                              <mainClass>org.apache.storm.flux.Flux</mainClass>
                              <manifestEntries>
                                <Change></Change>
                                <Build-Date></Build-Date>
                              </manifestEntries>
                            </transformer>
                        </transformers>
                        <!-- The filters below are necessary if you want to include the Tika
                            module -->
                        <filters>
                            <filter>
                                <artifact>*:*</artifact>
                                <excludes>
                                    <exclude>META-INF/*.SF</exclude>
                                    <exclude>META-INF/*.DSA</exclude>
                                    <exclude>META-INF/*.RSA</exclude>
                                </excludes>
                            </filter>
                        </filters>
                    </configuration>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>

<dependencies>
    <dependency>
        <groupId>org.apache.storm</groupId>
        <artifactId>storm-core</artifactId>
        <version>1.1.1</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.storm</groupId>
        <artifactId>flux-core</artifactId>
        <version>1.0.2</version>
    </dependency>
    <dependency>
        <groupId>com.digitalpebble.stormcrawler</groupId>
        <artifactId>storm-crawler-core</artifactId>
        <version>1.7</version>
    </dependency>
    <dependency>
        <groupId>com.digitalpebble.stormcrawler</groupId>
        <artifactId>storm-crawler-elasticsearch</artifactId>
        <version>1.7</version>
    </dependency>
    <dependency>
        <groupId>com.digitalpebble.stormcrawler</groupId>
        <artifactId>storm-crawler-tika</artifactId>
        <version>1.7</version>
    </dependency>
</dependencies>

1 ответ

Решение

Ваши файлы pom и flux выглядят нормально. Чтобы упростить конфигурацию, инъекцию можно сделать частью основной топологии (основного flux-файла).

Что находится в crawler-conf.yaml? Вы добавили префиксы к именам полей?

Вот метаданные, извлеченные из URL, который вы разместили выше

parse.dcterms:modified: 2004-09-29T20:21:18Z
parse.pdf:PDFVersion: 1.4
parse.access_permission:can_print: true
parse.pdf:docinfo:subject: By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient. A wide variety of industries use metadata, but for the purposes of digital imaging, there are currently only a few technical structures or schema that are being employed. A schema is a set of properties and their defined meanings, such as the type of value (date, size, URL, or any useful designation). 
parse.pdf:docinfo:modified: 2004-09-29T20:21:18Z
parse.access_permission:extract_for_accessibility: true
parse.created: Fri Sep 24 15:56:30 BST 2004
parse.pdf:docinfo:created: 2004-09-24T14:56:30Z
parse.xmpTPg:NPages: 7
parse.access_permission:fill_in_form: true
parse.producer: Adobe PDF Library 6.0
parse.pdf:docinfo:title: About Metadata
parse.pdf:docinfo:producer: Adobe PDF Library 6.0
parse.dc:format: application/pdf; version=1.4
parse.access_permission:assemble_document: true
parse.access_permission:modify_annotations: true
parse.dc:title: About Metadata
parse.access_permission:can_print_degraded: true
parse.xmpMM:DocumentID: adobe:docid:indd:de7d50b0-0fc1-11d9-b0d4-cd42e793ca90
parse.xmpMM:DerivedFrom:DocumentID: adobe:docid:indd:a04d199f-0f11-11d9-b74d-bb0abf4f1ab0
parse.title: About Metadata
parse.Creation-Date: 2004-09-24T14:56:30Z
parse.modified: 2004-09-29T20:21:18Z
parse.resourceName: /digitalimag/pdfs/about_metadata.pdf
parse.dc:description: By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient. A wide variety of industries use metadata, but for the purposes of digital imaging, there are currently only a few technical structures or schema that are being employed. A schema is a set of properties and their defined meanings, such as the type of value (date, size, URL, or any useful designation). 
parse.Last-Save-Date: 2004-09-29T20:21:18Z
parse.creator: Adobe Systems Incorporated
parse.pdf:encrypted: false
parse.trapped: False
parse.pdf:docinfo:creator: Adobe Systems Incorporated
parse.date: 2004-09-29T20:21:18Z
parse.meta:save-date: 2004-09-29T20:21:18Z
parse.Author: Adobe Systems Incorporated
parse.X-Parsed-By: org.apache.tika.parser.DefaultParser
parse.X-Parsed-By: org.apache.tika.parser.pdf.PDFParser
parse.pdf:docinfo:creator_tool: Adobe InDesign CS (3.0.1)
parse.dcterms:created: 2004-09-24T14:56:30Z
parse.access_permission:can_modify: true
parse.subject: By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient. A wide variety of industries use metadata, but for the purposes of digital imaging, there are currently only a few technical structures or schema that are being employed. A schema is a set of properties and their defined meanings, such as the type of value (date, size, URL, or any useful designation). 
parse.meta:author: Adobe Systems Incorporated
parse.access_permission:extract_content: true
parse.xmp:CreatorTool: Adobe InDesign CS (3.0.1)
parse.dc:creator: Adobe Systems Incorporated
parse.cp:subject: By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient. A wide variety of industries use metadata, but for the purposes of digital imaging, there are currently only a few technical structures or schema that are being employed. A schema is a set of properties and their defined meanings, such as the type of value (date, size, URL, or any useful designation). 
parse.pdf:docinfo:trapped: False
parse.meta:creation-date: 2004-09-24T14:56:30Z
parse.xmpMM:DerivedFrom:InstanceID: de7d50af-0fc1-11d9-b0d4-cd42e793ca90
parse.Last-Modified: 2004-09-29T20:21:18Z
parse.Content-Type: application/pdf
parse.description: By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient. A wide variety of industries use metadata, but for the purposes of digital imaging, there are currently only a few technical structures or schema that are being employed. A schema is a set of properties and their defined meanings, such as the type of value (date, size, URL, or any useful designation). 

Ваш конфигурационный файл должен содержать что-то вроде

  indexer.md.mapping:
  - parse.title=title
  - parse.Author=author

Как можно понять из кода тестового примера, вам нужно добавить файл в каталог external/tika/src/test/resources/ и сослаться на имя этого файла в тестовом коде, как в приведённом ниже примере с about_metadata.pdf.

 @Test
public void testMetadata() throws IOException {
    // Prepare the bolt with an empty configuration, a mocked topology
    // context and a collector that records emitted tuples in `output`.
    bolt.prepare(
            new HashMap(),
            TestUtil.getMockedTopologyContext(),
            new OutputCollector(output));

    // Feed the sample PDF (looked up by file name) through the bolt under
    // the URL it was originally published at.
    final String url = "https://www.adobe.com/digitalimag/pdfs/about_metadata.pdf";
    final String fileName = "about_metadata.pdf";
    parse(url, fileName);

    List<List<Object>> emitted = output.getEmitted();

    // Exactly one tuple is expected for a single document.
    Assert.assertEquals(1, emitted.size());

    // The metadata object is read from position 2 of the emitted tuple.
    Metadata metadata = (Metadata) emitted.get(0).get(2);

    // The PDF subject must be exposed under the "parse."-prefixed key.
    final String expectedSubject = "By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient.";
    String actualSubject = metadata.getFirstValue("parse.pdf:docinfo:subject");
    Assert.assertTrue(actualSubject.contains(expectedSubject));
}

ОБНОВЛЕНИЕ

При ближайшем рассмотрении оказалось, что проблема кроется в определении потоков вашего flux-файла. Болт перенаправления (RedirectionBolt) отправляет кортежи в Tika по отдельному потоку (stream) с именем "tika". Поэтому определение должно быть таким:

from: "redirection_bolt"
to: "parser_bolt"
grouping:
  type: LOCAL_OR_SHUFFLE
  streamId: "tika"
Другие вопросы по тегам