Stormcrawler: Apache Tika для анализа свойств PDF
Я добавил Tika в качестве зависимости в свою реализацию StormCrawler, и теперь краулер загружает PDF-документы. Однако заголовок, авторы и другие свойства не извлекаются. Я пробовал различные комбинации в index.md.mapping: и добавлял соответствующие свойства в ES_IndexInit, но поле content в Kibana (индекс index) для PDF-документов всегда пусто. Для HTML-страниц всё работает. Не могли бы вы подсказать, что я упустил, или указать пример, на который можно посмотреть?
es-crawler.flux:
# Flux definition of the crawl topology (es-crawler.flux).
# Flux keys (parallelism, streams, from, to, grouping, type) and component ids
# must be the literal English identifiers; the routing is kept as posted.
name: "crawler"

includes:
  - resource: true
    file: "/crawler-default.yaml"
    override: false

  - resource: false
    file: "crawler-conf.yaml"
    override: true

  - resource: false
    file: "es-conf.yaml"
    override: true

spouts:
  - id: "spout"
    className: "com.digitalpebble.stormcrawler.elasticsearch.persistence.AggregationSpout"
    parallelism: 10

bolts:
  - id: "partitioner"
    className: "com.digitalpebble.stormcrawler.bolt.URLPartitionerBolt"
    parallelism: 1
  - id: "fetcher"
    className: "com.digitalpebble.stormcrawler.bolt.FetcherBolt"
    parallelism: 1
  - id: "sitemap"
    className: "com.digitalpebble.stormcrawler.bolt.SiteMapParserBolt"
    parallelism: 1
  - id: "parse"
    className: "com.digitalpebble.stormcrawler.bolt.JSoupParserBolt"
    parallelism: 5
  - id: "index"
    className: "com.digitalpebble.stormcrawler.elasticsearch.bolt.IndexerBolt"
    parallelism: 1
  - id: "status"
    className: "com.digitalpebble.stormcrawler.elasticsearch.persistence.StatusUpdaterBolt"
    parallelism: 1
  - id: "status_metrics"
    className: "com.digitalpebble.stormcrawler.elasticsearch.metrics.StatusMetricsBolt"
    parallelism: 4
  - id: "redirection_bolt"
    className: "com.digitalpebble.stormcrawler.tika.RedirectionBolt"
    parallelism: 1
  - id: "parser_bolt"
    className: "com.digitalpebble.stormcrawler.tika.ParserBolt"
    parallelism: 1

streams:
  - from: "spout"
    to: "partitioner"
    grouping:
      type: SHUFFLE

  - from: "spout"
    to: "status_metrics"
    grouping:
      type: SHUFFLE

  - from: "partitioner"
    to: "fetcher"
    grouping:
      type: FIELDS
      args: ["key"]

  - from: "fetcher"
    to: "sitemap"
    grouping:
      type: LOCAL_OR_SHUFFLE

  - from: "sitemap"
    to: "parse"
    grouping:
      type: LOCAL_OR_SHUFFLE

  - from: "parse"
    to: "index"
    grouping:
      type: LOCAL_OR_SHUFFLE

  - from: "fetcher"
    to: "status"
    grouping:
      type: FIELDS
      args: ["url"]
      streamId: "status"

  - from: "sitemap"
    to: "status"
    grouping:
      type: FIELDS
      args: ["url"]
      streamId: "status"

  - from: "parse"
    to: "status"
    grouping:
      type: FIELDS
      args: ["url"]
      streamId: "status"

  - from: "index"
    to: "status"
    grouping:
      type: FIELDS
      args: ["url"]
      streamId: "status"

  - from: "parse"
    to: "redirection_bolt"
    grouping:
      type: LOCAL_OR_SHUFFLE

  # As posted: this subscription uses the default stream (no streamId).
  - from: "redirection_bolt"
    to: "parser_bolt"
    grouping:
      type: LOCAL_OR_SHUFFLE

  - from: "redirection_bolt"
    to: "index"
    grouping:
      type: LOCAL_OR_SHUFFLE

  - from: "parser_bolt"
    to: "index"
    grouping:
      type: LOCAL_OR_SHUFFLE
es-injector.flux:
# Flux definition of the seed-injection topology (es-injector.flux).
# Reads seed.txt from the current directory and sends the URLs to the status bolt.
name: "injector"

includes:
  - resource: true
    file: "/crawler-default.yaml"
    override: false

  - resource: false
    file: "crawler-conf.yaml"
    override: true

  - resource: false
    file: "es-conf.yaml"
    override: true

  - resource: false
    file: "injection-conf.yaml"
    override: true

components:
  - id: "scheme"
    className: "com.digitalpebble.stormcrawler.util.StringTabScheme"
    constructorArgs:
      - DISCOVERED

spouts:
  - id: "spout"
    className: "com.digitalpebble.stormcrawler.spout.FileSpout"
    parallelism: 1
    constructorArgs:
      - "."
      - "seed.txt"
      - ref: "scheme"

bolts:
  - id: "status"
    className: "com.digitalpebble.stormcrawler.elasticsearch.persistence.StatusUpdaterBolt"
    parallelism: 1
  # Declared as posted; no stream in this topology is connected to it.
  - id: "parser_bolt"
    className: "com.digitalpebble.stormcrawler.tika.ParserBolt"
    parallelism: 1

streams:
  - from: "spout"
    to: "status"
    grouping:
      type: FIELDS
      args: ["url"]
pom.xml:
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  <!-- Build file for the crawler: packages a shaded jar whose main class is Flux. -->
  <modelVersion>4.0.0</modelVersion>
  <groupId>xyz.com</groupId>
  <artifactId>search</artifactId>
  <version>search1.0</version>
  <packaging>jar</packaging>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.2</version>
        <configuration>
          <source>1.8</source>
          <target>1.8</target>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.codehaus.mojo</groupId>
        <artifactId>exec-maven-plugin</artifactId>
        <version>1.3.2</version>
        <executions>
          <execution>
            <goals>
              <goal>exec</goal>
            </goals>
          </execution>
        </executions>
        <configuration>
          <executable>java</executable>
          <includeProjectDependencies>true</includeProjectDependencies>
          <includePluginDependencies>false</includePluginDependencies>
          <classpathScope>compile</classpathScope>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
        <version>1.3.3</version>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
            <configuration>
              <createDependencyReducedPom>false</createDependencyReducedPom>
              <transformers>
                <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                  <mainClass>org.apache.storm.flux.Flux</mainClass>
                  <manifestEntries>
                    <Change></Change>
                    <Build-Date></Build-Date>
                  </manifestEntries>
                </transformer>
              </transformers>
              <!-- The filters below are necessary if you want to include the Tika module -->
              <filters>
                <filter>
                  <artifact>*:*</artifact>
                  <excludes>
                    <exclude>META-INF/*.SF</exclude>
                    <exclude>META-INF/*.DSA</exclude>
                    <exclude>META-INF/*.RSA</exclude>
                  </excludes>
                </filter>
              </filters>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>

  <dependencies>
    <dependency>
      <groupId>org.apache.storm</groupId>
      <artifactId>storm-core</artifactId>
      <version>1.1.1</version>
      <scope>provided</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.storm</groupId>
      <artifactId>flux-core</artifactId>
      <version>1.0.2</version>
    </dependency>
    <dependency>
      <groupId>com.digitalpebble.stormcrawler</groupId>
      <artifactId>storm-crawler-core</artifactId>
      <version>1.7</version>
    </dependency>
    <dependency>
      <groupId>com.digitalpebble.stormcrawler</groupId>
      <artifactId>storm-crawler-elasticsearch</artifactId>
      <version>1.7</version>
    </dependency>
    <dependency>
      <groupId>com.digitalpebble.stormcrawler</groupId>
      <artifactId>storm-crawler-tika</artifactId>
      <version>1.7</version>
    </dependency>
  </dependencies>
</project>
1 ответ
Ваши файлы pom и flux выглядят нормально. Для простоты инъекцию можно сделать частью основного flux-файла.
Что находится в crawler-conf.yaml? Вы добавили префиксы к именам полей?
Вот метаданные, извлеченные из URL, который вы разместили выше
parse.dcterms:modified: 2004-09-29T20:21:18Z
parse.pdf:PDFVersion: 1.4
parse.access_permission:can_print: true
parse.pdf:docinfo:subject: By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient. A wide variety of industries use metadata, but for the purposes of digital imaging, there are currently only a few technical structures or schema that are being employed. A schema is a set of properties and their defined meanings, such as the type of value (date, size, URL, or any useful designation).
parse.pdf:docinfo:modified: 2004-09-29T20:21:18Z
parse.access_permission:extract_for_accessibility: true
parse.created: Fri Sep 24 15:56:30 BST 2004
parse.pdf:docinfo:created: 2004-09-24T14:56:30Z
parse.xmpTPg:NPages: 7
parse.access_permission:fill_in_form: true
parse.producer: Adobe PDF Library 6.0
parse.pdf:docinfo:title: About Metadata
parse.pdf:docinfo:producer: Adobe PDF Library 6.0
parse.dc:format: application/pdf; version=1.4
parse.access_permission:assemble_document: true
parse.access_permission:modify_annotations: true
parse.dc:title: About Metadata
parse.access_permission:can_print_degraded: true
parse.xmpMM:DocumentID: adobe:docid:indd:de7d50b0-0fc1-11d9-b0d4-cd42e793ca90
parse.xmpMM:DerivedFrom:DocumentID: adobe:docid:indd:a04d199f-0f11-11d9-b74d-bb0abf4f1ab0
parse.title: About Metadata
parse.Creation-Date: 2004-09-24T14:56:30Z
parse.modified: 2004-09-29T20:21:18Z
parse.resourceName: /digitalimag/pdfs/about_metadata.pdf
parse.dc:description: By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient. A wide variety of industries use metadata, but for the purposes of digital imaging, there are currently only a few technical structures or schema that are being employed. A schema is a set of properties and their defined meanings, such as the type of value (date, size, URL, or any useful designation).
parse.Last-Save-Date: 2004-09-29T20:21:18Z
parse.creator: Adobe Systems Incorporated
parse.pdf:encrypted: false
parse.trapped: False
parse.pdf:docinfo:creator: Adobe Systems Incorporated
parse.date: 2004-09-29T20:21:18Z
parse.meta:save-date: 2004-09-29T20:21:18Z
parse.Author: Adobe Systems Incorporated
parse.X-Parsed-By: org.apache.tika.parser.DefaultParser
parse.X-Parsed-By: org.apache.tika.parser.pdf.PDFParser
parse.pdf:docinfo:creator_tool: Adobe InDesign CS (3.0.1)
parse.dcterms:created: 2004-09-24T14:56:30Z
parse.access_permission:can_modify: true
parse.subject: By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient. A wide variety of industries use metadata, but for the purposes of digital imaging, there are currently only a few technical structures or schema that are being employed. A schema is a set of properties and their defined meanings, such as the type of value (date, size, URL, or any useful designation).
parse.meta:author: Adobe Systems Incorporated
parse.access_permission:extract_content: true
parse.xmp:CreatorTool: Adobe InDesign CS (3.0.1)
parse.dc:creator: Adobe Systems Incorporated
parse.cp:subject: By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient. A wide variety of industries use metadata, but for the purposes of digital imaging, there are currently only a few technical structures or schema that are being employed. A schema is a set of properties and their defined meanings, such as the type of value (date, size, URL, or any useful designation).
parse.pdf:docinfo:trapped: False
parse.meta:creation-date: 2004-09-24T14:56:30Z
parse.xmpMM:DerivedFrom:InstanceID: de7d50af-0fc1-11d9-b0d4-cd42e793ca90
parse.Last-Modified: 2004-09-29T20:21:18Z
parse.Content-Type: application/pdf
parse.description: By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient. A wide variety of industries use metadata, but for the purposes of digital imaging, there are currently only a few technical structures or schema that are being employed. A schema is a set of properties and their defined meanings, such as the type of value (date, size, URL, or any useful designation).
Ваш конфигурационный файл должен содержать что-то вроде
indexer.md.mapping:
- parse.title=title
- parse.Author=author
Как можно догадаться из кода тестового примера, нужно добавить файл в каталог external/tika/src/test/resources/ и сослаться на имя файла в тестовом коде, как в приведённом ниже примере с about_metadata.pdf.
/**
 * Parses the bundled {@code about_metadata.pdf} resource and checks that a single
 * tuple is emitted whose metadata exposes the PDF subject under the key
 * {@code parse.pdf:docinfo:subject}.
 *
 * @throws IOException declared by the test signature for the parsing step
 */
@Test
public void testMetadata() throws IOException {
    bolt.prepare(
            new HashMap(),
            TestUtil.getMockedTopologyContext(),
            new OutputCollector(output));

    // First argument is the document URL, second the test resource file name.
    parse(
            "https://www.adobe.com/digitalimag/pdfs/about_metadata.pdf",
            "about_metadata.pdf");

    List<List<Object>> emitted = output.getEmitted();

    // One document in, exactly one tuple out.
    Assert.assertEquals(1, emitted.size());

    // The Metadata object is read from index 2 of the emitted tuple.
    List<Object> firstTuple = emitted.get(0);
    Metadata metadata = (Metadata) firstTuple.get(2);

    final String expectedSubjectStart =
            "By simple definition, metadata is data about data. Metadata is structured information that explains, describes, or locates the original primary data, or that otherwise makes using the original primary data more efficient.";
    Assert.assertTrue(
            metadata.getFirstValue("parse.pdf:docinfo:subject").contains(expectedSubjectStart));
}
ОБНОВЛЕНИЕ
При ближайшем рассмотрении оказалось, что проблема в вашем flux-файле. RedirectionBolt отправляет кортежи в Tika по отдельному потоку с именем "tika". Поэтому определение потока должно быть таким:
# Stream entry connecting the redirection bolt to the Tika parser bolt.
# streamId subscribes parser_bolt to the stream named "tika".
from: "redirection_bolt"
to: "parser_bolt"
grouping:
type: LOCAL_OR_SHUFFLE
streamId: "tika"