Частота фразы с использованием TermFreq
Я пытаюсь использовать функцию termFreq()
с фразой, примерно так:
termfreq(field, "cool phrase")
Я использую ShingleFilter,
чтобы он группировал слова и находил результаты как один термин. Когда я вызываю функцию так:
termfreq(field,phrase)
слово phrase обрабатывается всеми фильтрами, заданными для поля, например:
stemming
lowercase
stopwords
и т. п.
Но когда я передаю фразу из нескольких слов, это работает, только если термин указан в точности. Например:
termFreq(field, "cool phrase") -> x
termFreq(field, "cooL PHRASE") -> y
Это ошибка?
ОБНОВЛЕНИЕ:
Мой schema.xml:
<?xml version="1.0" encoding="UTF-8" ?>
<schema name="acordaoDocumentSchema" version="1.5">
<!--
  Field declarations. Attribute order is normalised to
  name, type, indexed, stored, then any extra flags.
-->
<fields>
  <!-- NOTE(review): Solr's stock example schemas declare _version_ with a long
       type; here it is a string and this schema defines no long type. Confirm
       this is intended before enabling features that depend on _version_. -->
  <field name="_version_" type="string" indexed="true" stored="true" multiValued="false"/>

  <!-- "chave" is the field named by the schema's uniqueKey element. -->
  <field name="chave" type="string" indexed="true" stored="true"/>
  <field name="cdAcordao" type="string" indexed="true" stored="true"/>

  <!-- Analyzed text fields (texto_indexado / texto_para_facetas). -->
  <field name="nuRegistro" type="texto_indexado" indexed="true" stored="true"/>
  <!-- Indexed only (not stored), multi-valued, with term vectors including
       positions and offsets. It is also the destination of three copyField
       rules (nuRegistro, nuProcOrigem, nuProcesso). -->
  <field name="deInteiroTeor"
         type="texto_indexado"
         indexed="true"
         stored="false"
         multiValued="true"
         termVectors="true"
         termPositions="true"
         termOffsets="true"/>
  <field name="deEmenta" type="texto_indexado" indexed="true" stored="true"/>
  <!-- Indexed-only copy of deEmenta, filled by a copyField rule. -->
  <field name="deEmentaParaFacetas" type="texto_para_facetas" indexed="true" stored="false"/>

  <!-- Date fields. dtAtualizacao declares no "indexed" attribute. -->
  <field name="dtJulgamento" type="date" indexed="true" stored="true"/>
  <field name="dtRegistro" type="date" indexed="true" stored="true"/>
  <field name="dtAtualizacao" type="date" stored="true"/>

  <!-- fl* fields: string type, indexed but not stored. -->
  <field name="flJurisprudencia" type="string" indexed="true" stored="false"/>
  <field name="flSegredoJustica" type="string" indexed="true" stored="false"/>
  <field name="flMostraInternet" type="string" indexed="true" stored="false"/>
  <field name="flAtivo" type="string" indexed="true" stored="false"/>
  <field name="flTpDecisao" type="string" indexed="true" stored="false"/>

  <!-- cd* fields: string type, indexed and stored. -->
  <field name="cdAgente" type="string" indexed="true" stored="true"/>
  <field name="cdAgenteForo" type="string" indexed="true" stored="true"/>
  <field name="cdJuizProlator" type="string" indexed="true" stored="true"/>
  <field name="cdComarca" type="string" indexed="true" stored="true"/>
  <field name="cdOrgaoJulgador" type="string" indexed="true" stored="true"/>
  <field name="cdForo" type="string" indexed="true" stored="true"/>
  <field name="cdVara" type="string" indexed="true" stored="true"/>
  <field name="cdClasse" type="string" indexed="true" stored="true"/>
  <field name="cdAssuntoPrinc" type="string" indexed="true" stored="true"/>

  <!-- Fields analyzed with texto_numero_processo. -->
  <field name="nuProcOrigem" type="texto_numero_processo" indexed="true" stored="true"/>
  <field name="nuProcesso" type="texto_numero_processo" indexed="true" stored="true"/>
</fields>
<!-- Document identity: the "chave" field is the unique key. -->
<uniqueKey>chave</uniqueKey>

<!-- deEmenta is copied into deEmentaParaFacetas, which uses a different field type. -->
<copyField dest="deEmentaParaFacetas" source="deEmenta"/>

<!-- nuRegistro, nuProcOrigem and nuProcesso are each copied into the multi-valued deInteiroTeor field. -->
<copyField dest="deInteiroTeor" source="nuRegistro"/>
<copyField dest="deInteiroTeor" source="nuProcOrigem"/>
<copyField dest="deInteiroTeor" source="nuProcesso"/>
<types>
<!--
  texto_indexado: analyzed text with norms kept (omitNorms="false").

  Both analyzers replace line breaks with spaces, strip HTML, tokenize with
  StandardTokenizer, lowercase, and apply the Portuguese light stemmer.

  The index analyzer additionally runs StandardFilter and a ShingleFilter that
  emits word n-grams of up to 4 tokens alongside the single tokens. The query
  analyzer has neither, so query-time analysis yields single-word tokens only
  while the index also holds multi-word shingle terms.
-->
<fieldType name="texto_indexado" class="solr.TextField" omitNorms="false">
  <analyzer type="index">
    <!-- One pass turns every CR and every LF into a single space each. A
         separate CRLF pattern placed after a CR pattern can never match,
         because the CR is already gone by the time it runs. -->
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="[\r\n]" replacement=" "/>
    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StandardFilterFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.PortugueseLightStemFilterFactory"/>
    <!-- Shingles are built from the lowercased, stemmed tokens. -->
    <filter class="solr.ShingleFilterFactory" maxShingleSize="4" outputUnigrams="true"/>
  </analyzer>
  <analyzer type="query">
    <!-- Same line-break normalisation as the index analyzer. -->
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="[\r\n]" replacement=" "/>
    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.PortugueseLightStemFilterFactory"/>
  </analyzer>
</fieldType>
<!--
  texto_numero_processo: a single analyzer (used for both indexing and
  querying) that tokenizes with StandardTokenizer and then removes, from each
  token, every character that is not an ASCII letter, a digit or a space.
-->
<fieldType class="solr.TextField" name="texto_numero_processo">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.PatternReplaceFilterFactory"
            pattern="[^a-zA-Z0-9 ]"
            replacement=""
            replace="all"/>
  </analyzer>
</fieldType>
<!--
  texto_para_facetas: a single analyzer (index and query) built from
    1. whitespace tokenization,
    2. case-insensitive stop-word removal using a snowball-format word list,
    3. WordDelimiterFilter with all splitting and part generation switched
       off and all catenation options switched on,
    4. a length filter configured with min=4 and max=200.
-->
<fieldType class="solr.TextField" name="texto_para_facetas">
  <analyzer>
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.StopFilterFactory"
            words="../../conf/stopwords.txt"
            format="snowball"
            ignoreCase="true"
            enablePositionIncrements="true"/>
    <filter class="solr.WordDelimiterFilterFactory"
            splitOnCaseChange="0"
            splitOnNumerics="0"
            generateWordParts="0"
            generateNumberParts="0"
            catenateWords="1"
            catenateNumbers="1"
            catenateAll="1"/>
    <filter class="solr.LengthFilterFactory" min="4" max="200"/>
  </analyzer>
</fieldType>
<!-- Non-analyzed primitive types. Both set sortMissingLast="true"; the date type also omits norms. -->
<fieldType class="solr.StrField" name="string" sortMissingLast="true"/>
<fieldType class="solr.DateField" name="date" sortMissingLast="true" omitNorms="true"/>
</types>