fix/hack remaining filter and analysis issues

This commit is contained in:
Robert Muir 2014-10-29 01:00:15 -04:00
parent df53448856
commit 0f8740a782
7 changed files with 58 additions and 27 deletions

View file

@ -23,6 +23,7 @@ import org.apache.lucene.index.LeafReader;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.BitDocIdSet;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.RamUsageEstimator;
@ -81,11 +82,10 @@ public class DocIdSets {
}
// TODO: should we use WAH8DocIdSet like Lucene?
FixedBitSet fixedBitSet = new FixedBitSet(reader.maxDoc());
do {
fixedBitSet.set(doc);
doc = it.nextDoc();
} while (doc != DocIdSetIterator.NO_MORE_DOCS);
return new BitDocIdSet(fixedBitSet);
it = set.iterator();
long cost = it.cost();
fixedBitSet.or(it);
return new BitDocIdSet(fixedBitSet, cost);
}
/**
@ -114,4 +114,29 @@ public class DocIdSets {
set.or(iterator);
return set;
}
/**
 * Wraps a {@link BitSet} in a {@link BitDocIdSet} for callers that have no idea
 * of the cardinality and are afraid of the cost of computing the cost.
 * <p>
 * For a {@link FixedBitSet} the cost is taken from {@code guessCost}; for any
 * other {@link BitSet} it is taken from {@link BitSet#approximateCardinality()}.
 *
 * @param bs the bit set to wrap
 * @return a {@link BitDocIdSet} over {@code bs} carrying the estimated cost
 * @deprecated remove usages of this.
 */
@Deprecated
public static BitDocIdSet newDocIDSet(BitSet bs) {
    final int estimatedCost = (bs instanceof FixedBitSet)
            ? guessCost((FixedBitSet) bs)
            : bs.approximateCardinality();
    return new BitDocIdSet(bs, estimatedCost);
}
// nocommit: we should instead base this always on cost of clauses and stuff???
/**
 * Estimates the number of set bits in {@code bs}.
 * <p>
 * Bit sets shorter than 8192 bits are counted exactly. For longer ones only the
 * first 8192 bits are counted and that count is scaled up proportionally to the
 * full length, so the result is an extrapolation, not an exact cardinality.
 *
 * @param bs the bit set to estimate
 * @return the exact cardinality for short sets, otherwise a scaled estimate
 *         that never exceeds {@code bs.length()}
 */
private static int guessCost(FixedBitSet bs) {
    final int sampleBits = 8192;
    final int numBits = bs.length();
    if (numBits < sampleBits) {
        return bs.cardinality();
    }
    // Count only the leading sampleBits bits, via a FixedBitSet built over the words of bs.
    final int sampleCardinality = new FixedBitSet(bs.getBits(), sampleBits).cardinality();
    // Multiply before dividing (in long arithmetic) so the scale factor is not
    // truncated to a whole number of samples; dividing first made a 16383-bit
    // set extrapolate as if it had only 8192 bits. Since sampleCardinality is
    // at most sampleBits, the result is at most numBits and fits in an int.
    return (int) ((long) numBits * sampleCardinality / sampleBits);
}
}

View file

@ -25,6 +25,7 @@ import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.FixedBitSet;
import org.elasticsearch.common.lucene.docset.AllDocIdSet;
import org.elasticsearch.common.lucene.docset.DocIdSets;
@ -179,7 +180,7 @@ public class XBooleanFilter extends Filter implements Iterable<FilterClause> {
if (!hasBits) {
if (!fastOrClauses.isEmpty()) {
DocIdSetIterator it = res.iterator();
DocIdSetIterator it = new BitSetIterator(res, 0);
at_least_one_should_clause_iter:
for (int setDoc = it.nextDoc(); setDoc != DocIdSetIterator.NO_MORE_DOCS; setDoc = it.nextDoc()) {
for (ResultClause fastOrClause : fastOrClauses) {
@ -199,7 +200,7 @@ public class XBooleanFilter extends Filter implements Iterable<FilterClause> {
if (hasShouldClauses && !hasNonEmptyShouldClause) {
return null;
} else {
return res;
return DocIdSets.newDocIDSet(res);
}
}
@ -244,7 +245,7 @@ public class XBooleanFilter extends Filter implements Iterable<FilterClause> {
} else {
Bits bits = clause.bits;
// use the "res" to drive the iteration
DocIdSetIterator it = res.iterator();
DocIdSetIterator it = new BitSetIterator(res, 0);
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
if (!bits.get(doc)) {
res.clear(doc);
@ -262,7 +263,7 @@ public class XBooleanFilter extends Filter implements Iterable<FilterClause> {
} else {
Bits bits = clause.bits;
// let res drive the iteration
DocIdSetIterator it = res.iterator();
DocIdSetIterator it = new BitSetIterator(res, 0);
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
if (bits.get(doc)) {
res.clear(doc);
@ -277,7 +278,7 @@ public class XBooleanFilter extends Filter implements Iterable<FilterClause> {
// clause must match in order for a doc to be a match. What we do here is checking if matched docs match with
// any should filter. TODO: Add an option to have disable minimum_should_match=1 behaviour
if (!slowOrClauses.isEmpty() || !fastOrClauses.isEmpty()) {
DocIdSetIterator it = res.iterator();
DocIdSetIterator it = new BitSetIterator(res, 0);
at_least_one_should_clause_iter:
for (int setDoc = it.nextDoc(); setDoc != DocIdSetIterator.NO_MORE_DOCS; setDoc = it.nextDoc()) {
for (ResultClause fastOrClause : fastOrClauses) {
@ -303,7 +304,7 @@ public class XBooleanFilter extends Filter implements Iterable<FilterClause> {
if (hasShouldClauses && !hasNonEmptyShouldClause) {
return null;
} else {
return res;
return DocIdSets.newDocIDSet(res);
}
}

View file

@ -91,8 +91,10 @@ public class KeepWordFilterFactory extends AbstractTokenFilterFactory {
public TokenStream create(TokenStream tokenStream) {
if (version.onOrAfter(Version.LUCENE_4_4)) {
return new KeepWordFilter(tokenStream, keepWords);
} else {
// nocommit: what happened here?
throw new UnsupportedOperationException();
}
return new KeepWordFilter(version, enablePositionIncrements, tokenStream, keepWords);
}

View file

@ -57,7 +57,9 @@ public class LengthTokenFilterFactory extends AbstractTokenFilterFactory {
public TokenStream create(TokenStream tokenStream) {
if (version.onOrAfter(Version.LUCENE_4_4)) {
return new LengthFilter(tokenStream, min, max);
} else {
// nocommit: what happened here?
throw new UnsupportedOperationException();
}
return new LengthFilter(version, enablePositionIncrements, tokenStream, min, max);
}
}

View file

@ -64,8 +64,11 @@ public class StopTokenFilterFactory extends AbstractTokenFilterFactory {
@Override
public TokenStream create(TokenStream tokenStream) {
if (removeTrailing) {
StopFilter filter = new StopFilter(version, tokenStream, stopWords);
filter.setEnablePositionIncrements(enablePositionIncrements);
StopFilter filter = new StopFilter(tokenStream, stopWords);
if (enablePositionIncrements == false) {
// nocommit: what happened here?
throw new UnsupportedOperationException();
}
return filter;
} else {
return new SuggestStopFilter(tokenStream, stopWords);

View file

@ -19,14 +19,12 @@
package org.elasticsearch.index.search.geo;
import org.apache.lucene.util.BitDocIdSet;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.BitDocIdSet;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.common.geo.GeoPoint;
import org.elasticsearch.common.lucene.docset.DocIdSets;
@ -81,9 +79,9 @@ public class IndexedGeoBoundingBoxFilter {
}
} else {
if (main == null) {
main = (FixedBitSet) set;
main = ((BitDocIdSet) set).bits();
} else {
main.or((FixedBitSet) set);
main.or(set.iterator());
}
}
@ -92,7 +90,7 @@ public class IndexedGeoBoundingBoxFilter {
return null;
}
main.and(set.iterator());
return main;
return DocIdSets.newDocIDSet(main);
}
@Override
@ -129,19 +127,19 @@ public class IndexedGeoBoundingBoxFilter {
}
@Override
public FixedBitSet getDocIdSet(LeafReaderContext context, Bits acceptedDocs) throws IOException {
FixedBitSet main;
public BitDocIdSet getDocIdSet(LeafReaderContext context, Bits acceptedDocs) throws IOException {
BitSet main;
DocIdSet set = lonFilter.getDocIdSet(context, acceptedDocs);
if (DocIdSets.isEmpty(set)) {
return null;
}
main = (FixedBitSet) set;
main = ((BitDocIdSet) set).bits();
set = latFilter.getDocIdSet(context, acceptedDocs);
if (DocIdSets.isEmpty(set)) {
return null;
}
main.and(set.iterator());
return main;
return DocIdSets.newDocIDSet(main);
}
@Override

View file

@ -75,7 +75,6 @@ public class StopTokenFilterTests extends ElasticsearchTokenStreamTestCase {
tokenizer.setReader(new StringReader("foo bar"));
TokenStream create = tokenFilter.create(tokenizer);
assertThat(create, instanceOf(StopFilter.class));
assertThat(((StopFilter)create).getEnablePositionIncrements(), equalTo(true));
}
@Test
@ -90,7 +89,8 @@ public class StopTokenFilterTests extends ElasticsearchTokenStreamTestCase {
tokenizer.setReader(new StringReader("foo bar"));
TokenStream create = tokenFilter.create(tokenizer);
assertThat(create, instanceOf(StopFilter.class));
assertThat(((StopFilter)create).getEnablePositionIncrements(), equalTo(false));
// nocommit: was posInc=false actually supported in 4.3 in lucene (other than for ancient back compat?)
fail("what happened here, and what to do about it");
}
@Test