Cheap isFill check and add --skip-filled-tiles option #234

Merged · 7 commits · Jun 4, 2022

Changes from 1 commit

@@ -76,6 +76,7 @@ public class VectorTile {
private static final int EXTENT = 4096;
private static final double SIZE = 256d;
private final Map<String, Layer> layers = new LinkedHashMap<>();
private Boolean isFill = null;

private static int[] getCommands(Geometry input, int scale) {
var encoder = new CommandEncoder(scale);
@@ -495,6 +496,23 @@ public byte[] encode() {
return tile.build().toByteArray();
}

/** Returns true if this tile contains only polygon fills (for example, the middle of the ocean). */
public boolean containsOnlyPolygonFills() {
if (isFill == null) {
boolean empty = true;
for (var layer : layers.values()) {
for (var feature : layer.encodedFeatures) {
empty = false;
if (!feature.geometry.isFill()) {
return false;
}
}
}
isFill = !empty;
}
return isFill;
}

private enum Command {
MOVE_TO(1),
LINE_TO(2),
@@ -566,6 +584,79 @@ public String toString() {
"], geomType=" + geomType +
" (" + geomType.asByte() + ")]";
}

/** Returns true if the encoded geometry is a polygon fill. */
public boolean isFill() {
if (geomType != GeometryType.POLYGON) {
return false;
}

int extent = EXTENT << scale;
int firstX = 0;
int firstY = 0;
int x = 0;
int y = 0;

int geometryCount = commands.length;
int length = 0;
int command = 0;
int i = 0;
while (i < geometryCount) {

if (length <= 0) {
length = commands[i++];
command = length & ((1 << 3) - 1);
length = length >> 3;
}

if (length > 0) {
if (command == Command.CLOSE_PATH.value) {
if (doesSegmentCrossTile(x, y, firstX, firstY, extent)) {
return false;
}
length--;
continue;
}

int dx = commands[i++];
int dy = commands[i++];

length--;

dx = zigZagDecode(dx);
dy = zigZagDecode(dy);

int nextX = x + dx;
int nextY = y + dy;

if (command == Command.MOVE_TO.value) {
firstX = nextX;
firstY = nextY;
if (isInsideTile(firstX, firstY, extent)) {
return false;
}
} else if (doesSegmentCrossTile(x, y, nextX, nextY, extent)) {
return false;
}
y = nextY;
x = nextX;
}

}

return true;
}

private static boolean isInsideTile(int x, int y, int extent) {
return x >= 0 && x <= extent && y >= 0 && y <= extent;
}

private static boolean doesSegmentCrossTile(int x1, int y1, int x2, int y2, int extent) {
return (y1 >= 0 || y2 >= 0) &&
(y1 <= extent || y2 <= extent) &&
(x1 >= 0 || x2 >= 0) &&
(x1 <= extent || x2 <= extent);
}
}
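
Note: the zigZagDecode helper called in isFill() above is outside this diff hunk. For context, a minimal sketch of the standard Mapbox Vector Tile zigzag decoding it is expected to perform (per the MVT spec, decoded = (n >> 1) ^ -(n & 1)); the body shown here is an illustration, not the project's actual source:

  // Illustration of standard MVT zigzag decoding; the real helper in this class
  // is not shown in the diff above.
  private static int zigZagDecode(int n) {
    return (n >> 1) ^ -(n & 1); // 0, 1, 2, 3, 4, ... -> 0, -1, 1, -2, 2, ...
  }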

@@ -41,7 +41,8 @@ public record PlanetilerConfig(
double simplifyToleranceAtMaxZoom,
double simplifyToleranceBelowMaxZoom,
boolean osmLazyReads,
boolean compactDb
boolean compactDb,
boolean skipFilledTiles
) {

public static final int MIN_MINZOOM = 0;
@@ -142,6 +143,9 @@ public static PlanetilerConfig from(Arguments arguments) {
false),
arguments.getBoolean("compact_db",
"Reduce the DB size by separating and deduping the tile data",
false),
arguments.getBoolean("skip_filled_tiles",
"Skip writing tiles containing only polygon fills to the output",
false)
);
}
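
For illustration, a rough sketch of how the new option flows into PlanetilerConfig. This is hedged: Arguments.of(Map) is assumed here as a test-style convenience for building arguments from key/value pairs (the integration test below passes the flag the same way); on the command line the flag is passed as --skip-filled-tiles per the PR title.

  // Rough sketch only, not part of this PR; Arguments.of(Map) is an assumed convenience.
  var arguments = Arguments.of(Map.of("skip_filled_tiles", "true"));
  PlanetilerConfig config = PlanetilerConfig.from(arguments);
  boolean skipFilled = config.skipFilledTiles(); // true - the writer consults this per tile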
@@ -49,7 +49,6 @@ public class MbtilesWriter {
private static final Logger LOGGER = LoggerFactory.getLogger(MbtilesWriter.class);
private static final long MAX_FEATURES_PER_BATCH = 10_000;
private static final long MAX_TILES_PER_BATCH = 1_000;
private static final int MAX_FEATURES_HASHING_THRESHOLD = 5;
private final Counter.Readable featuresProcessed;
private final Counter memoizedTiles;
private final Mbtiles db;
@@ -258,7 +257,9 @@ private void tileEncoder(Iterable<TileBatch> prev, Consumer<TileBatch> next) thr
*/
byte[] lastBytes = null, lastEncoded = null;
Integer lastTileDataHash = null;
boolean lastIsFill = false;
boolean compactDb = config.compactDb();
boolean skipFilled = config.skipFilledTiles();

for (TileBatch batch : prev) {
Queue<TileEncodingResult> result = new ArrayDeque<>(batch.size());
@@ -270,23 +271,30 @@ private void tileEncoder(Iterable<TileBatch> prev, Consumer<TileBatch> next) thr
byte[] bytes, encoded;
Integer tileDataHash;
if (tileFeatures.hasSameContents(last)) {
if (skipFilled && lastIsFill) {
continue;
}
bytes = lastBytes;
encoded = lastEncoded;
tileDataHash = lastTileDataHash;
memoizedTiles.inc();
} else {
VectorTile en = tileFeatures.getVectorTileEncoder();
encoded = en.encode();
bytes = gzip(encoded);
if (skipFilled) {
lastIsFill = en.containsOnlyPolygonFills();
if (lastIsFill) {
continue;
}
}
lastEncoded = encoded = en.encode();
lastBytes = bytes = gzip(encoded);
last = tileFeatures;
lastEncoded = encoded;
lastBytes = bytes;
if (encoded.length > 1_000_000) {
LOGGER.warn("{} {}kb uncompressed",
tileFeatures.tileCoord(),
encoded.length / 1024);
}
if (compactDb && tileFeatures.getNumFeaturesToEmit() < MAX_FEATURES_HASHING_THRESHOLD) {
if (compactDb && en.containsOnlyPolygonFills()) {
msbarry (Contributor, Author) commented on May 27, 2022:

@bbilger what do you think of using this here instead of the feature count threshold?

The only issue I'm thinking of is that there are currently seams where ocean polygons overlap, so any tile containing a seam won't be just fills even though it will be repeated. Another option would be to merge these ocean polygons (implemented in #235).

bbilger (Contributor) replied:

@msbarry hard to say without a benchmark for encoding, but that seems a bit faster. I ran it for Australia twice, on this branch and on the main branch. The downside is that the file size increases by almost 100MB since some opportunities to dedupe seem to be missed. So it depends on the goal: increase performance a bit or reduce the file size.

# 1 skip-filled-tiles branch
    mbtiles          1m6s cpu:10m13s gc:4s avg:9.3
	  read    1x(22% 15s sys:2s wait:46s done:3s)
	  encode 11x(68% 45s wait:12s done:3s)
	  write   1x(57% 38s sys:4s wait:23s)
----------------------------------------
	features	3.6GB
	mbtiles	1.8GB

# 2 skip-filled-tiles branch
	mbtiles          1m7s cpu:10m23s gc:4s avg:9.3
	  read    1x(21% 14s sys:1s wait:46s done:3s)
	  encode 11x(68% 46s wait:12s done:3s)
	  write   1x(56% 38s sys:4s wait:24s)
----------------------------------------
	features	3.6GB
	mbtiles	1.8GB

# 3 main branch

	mbtiles          1m8s cpu:10m48s gc:4s avg:9.5
	  read    1x(10% 7s wait:58s done:3s)
	  encode 11x(72% 49s wait:10s done:3s)
	  write   1x(55% 37s sys:3s wait:26s)
----------------------------------------
	features	3.6GB
	mbtiles	1.7GB

# 4 main branch
	mbtiles          1m9s cpu:10m50s gc:4s avg:9.4
	  read    1x(10% 7s wait:58s done:3s)
	  encode 11x(72% 49s wait:11s done:3s)
	  write   1x(54% 37s sys:3s wait:26s)
----------------------------------------
	features	3.6GB
	mbtiles	1.7GB

msbarry (Contributor, Author) commented on May 30, 2022:

oops, part of this might be that I landed #235 but didn't merge it into this branch until just now - the water polygon seams add quite a bit of size, and merging them in encode adds a bit of extra time

bbilger (Contributor) replied:

Okay, after rebasing the results are almost identical, so I don't have any concerns then.

# 5 skip-filled-tiles branch rebased
    mbtiles          1m9s cpu:10m54s gc:4s avg:9.5
	  read    1x(9% 6s wait:58s done:3s)
	  encode 11x(72% 49s wait:10s done:3s)
	  write   1x(54% 37s sys:3s wait:26s)
----------------------------------------
	features	3.6GB
	mbtiles	1.7GB

# 6 skip-filled-tiles branch rebased
	mbtiles          1m9s cpu:10m50s gc:4s avg:9.4
	  read    1x(10% 7s wait:59s done:3s)
	  encode 11x(71% 49s wait:11s done:3s)
	  write   1x(53% 37s sys:3s wait:26s)
----------------------------------------
	features	3.6GB
	mbtiles	1.7GB

(it doesn't matter at all but just to mention: file size is 10MB larger with containsOnlyPolygonFills)

msbarry (Contributor, Author) replied:

We could use num features < 5 OR containsOnlyPolygonFills. On Australia, there are ~40k unique fill tiles, but 800k unique tiles with < 5 features. It's probably worth the slight file size increase to lower the risk of a hash collision...
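
Spelled out in the writer, that alternative would look roughly like this (a sketch of the option under discussion, not what was merged):

  // Sketch of the "pure fill OR fewer than 5 features" alternative discussed above.
  if (compactDb && (en.containsOnlyPolygonFills() || tileFeatures.getNumFeaturesToEmit() < 5)) {
    tileDataHash = tileFeatures.generateContentHash();
  } else {
    tileDataHash = null;
  }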

bbilger (Contributor) commented on Jun 2, 2022:

cntFilled     =  636515
cntLt5        = 1422225
ctFilledLt5   =  635047
cntFilledGte5 =    1468
Counted in #tileEncoder like so
...
if (en.containsOnlyPolygonFills()) {
  cntFilled.incrementAndGet();
}
if (tileFeatures.getNumFeaturesToEmit() < 5) {
  cntLt5.incrementAndGet();
}
if (en.containsOnlyPolygonFills() && tileFeatures.getNumFeaturesToEmit() < 5) {
  ctFilledLt5.incrementAndGet();
}
if (en.containsOnlyPolygonFills() && tileFeatures.getNumFeaturesToEmit() >= 5) {
  cntFilledGte5.incrementAndGet();
}
if (compactDb && en.containsOnlyPolygonFills()) {
  tileDataHash = tileFeatures.generateContentHash();
} else {
  tileDataHash = null;
}
...
# containsOnlyPolygonFills
select count(*) from tile_data; -- 1290598

# tileFeatures.getNumFeaturesToEmit() < MAX_FEATURES_HASHING_THRESHOLD
select count(*) from tile_data; -- 1254643

To start with, I don't fully understand the motivation for replacing tileFeatures.getNumFeaturesToEmit() < MAX_FEATURES_HASHING_THRESHOLD with containsOnlyPolygonFills. The threshold check seems simple enough. But I assume you want to avoid unnecessary hash code generation & lookups.

So on the one hand: with the threshold approach we hash twice, and more than required, but I think it should be fast enough, and we can remove a few more dupes (~10MB less).
On the other hand, with containsOnlyPolygonFills we miss a bunch of dupes, and this check also comes at a cost - but without a benchmark it's hard to say which is better/worse; I would assume both to be more or less identical overall.

So I'd tend to just keep tileFeatures.getNumFeaturesToEmit() < MAX_FEATURES_HASHING_THRESHOLD. But if avoiding hash generation/lookup is the main motivation, and since the few missed dupes are negligible, I would go with if (compactDb && tileFeatures.getNumFeaturesToEmit() < MAX_FEATURES_HASHING_THRESHOLD && en.containsOnlyPolygonFills()) (i.e. first check the threshold to avoid unnecessary #containsOnlyPolygonFills checks).

msbarry (Contributor, Author) replied:

The thought here was that "is fill" would be a more accurate indicator than the feature-count heuristic for whether a tile will be repeated a lot - for example, some future tileset where >5 polygons overlap for a large portion of the planet.

For Australia (16,169,927 tiles), it looks like:

# features   # tiles      repeated tiles   repeated (but not fill) tiles
1            13,331,155   13,101,957       6,062
2            1,681,737    1,400,406        7,283
3            583,603      406,630          17,596
4            140,253      17,113           4,566
5            65,943       107              103
6            47,374       15               15
7            36,311       8                8
8            28,772       5                5
9            23,901       4                4
>=10         230,878      31               31

So the isFill check catches 99.8% of repeated tiles and ends up with 41,350 distinct repeated tiles to track, while the <5 heuristic catches nearly 100% of repeated tiles but ends up with 810,670 (20x more) distinct tiles. So I guess it's a tradeoff of whether we'd rather get the additional 0.2% to lower the mbtiles size slightly, or minimize the chances of a hash collision by deduping 20x fewer tile hashes.

bbilger (Contributor) commented on Jun 3, 2022:

Okay, thanks for the additional explanation and crunching more numbers.

So given that the "is fill" check should be cheap & fast like you said, and

    "For example, some future tileset where >5 polygons overlap for a large portion of the planet."

and

    "isFill check catches 99.8% of repeated tiles"

and

    "minimize the chances of a hash collision"

are all very good arguments to me, I'd go with your suggestion to use "is fill" instead - and keep the #features out of the equation entirely here.

msbarry (Contributor, Author) replied:

OK thanks, I was trying to decide whether isFill() || numFeatures() <= N would be better to catch more repeated tiles, but even just using the lowest threshold of 1 increases # distinct tiles being hashed to 267k (6-7x more) and only catches 17% of the remaining tiles, so it's probably not worth it.

If we wanted to catch more of the repeated tiles, it would probably be better to expand the isFill logic to also identify whether the tile contains only the edge of a rectangle or a segment of a vertical/horizontal line (for example if a tileset includes latitude/longitude graticules).

    "given that the "is fill" check should be cheap & fast like you said"

I double-checked - VisualVM shows it's only taking ~100ms on my MacBook for Australia.

msbarry (Contributor, Author) replied:

Looks like the "isEdge" check on lines and rectangles catches almost all of the other repeated tiles and only increases the number of distinct tiles to track to 53,491 (a 30% increase), so I'll add that in to be safe and cover future cases with graticules.

tileDataHash = tileFeatures.generateContentHash();
} else {
tileDataHash = null;
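
As a rough illustration of the "isEdge" idea discussed in the thread above (a hypothetical sketch, not the follow-up code that actually landed): graticule lines and rectangle outlines consist only of axis-aligned segments, so such a check could reject a tile as soon as it sees a segment that is neither horizontal nor vertical.

  // Hypothetical illustration only - not the implementation added in the follow-up.
  // Tiles containing only rectangle edges or vertical/horizontal line segments
  // (e.g. latitude/longitude graticules) are made entirely of axis-aligned segments.
  private static boolean isAxisAlignedSegment(int x1, int y1, int x2, int y2) {
    return x1 == x2 || y1 == y2;
  }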
@@ -675,6 +675,25 @@ void testFullWorldPolygon() throws Exception {
)).stream().map(d -> d.geometry().geom().norm()).toList());
}

@Test
void testSkipFill() throws Exception {
var results = runWithReaderFeatures(
Map.of("threads", "1", "skip-filled-tiles", "true"),
List.of(
newReaderFeature(WORLD_POLYGON, Map.of())
),
(in, features) -> features.polygon("layer")
.setZoomRange(0, 6)
.setBufferPixels(4)
);

assertEquals(481, results.tiles.size());
// spot-check one filled tile does not exist
assertNull(results.tiles.get(TileCoord.ofXYZ(
Z4_TILES / 2, Z4_TILES / 2, 4
)));
}

@ParameterizedTest
@CsvSource({
"chesapeake.wkb, 4076",
@@ -21,6 +21,7 @@
import static com.onthegomap.planetiler.TestUtils.*;
import static com.onthegomap.planetiler.geo.GeoUtils.JTS_FACTORY;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotSame;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.DynamicTest.dynamicTest;
@@ -33,6 +34,8 @@
import org.junit.jupiter.api.DynamicTest;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.TestFactory;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.CsvSource;
import org.locationtech.jts.geom.Coordinate;
import org.locationtech.jts.geom.CoordinateXY;
import org.locationtech.jts.geom.Geometry;
@@ -333,6 +336,54 @@ void testMultipleFeaturesMultipleLayer() {
assertEquals("layer2", decoded.get(2).layer());
}

@ParameterizedTest
@CsvSource({
"true,-1,-1,257,257",
"true,-5,-5,260,260",
"false,-1,-1,254,254",
"false,0,-1,257,257",
"false,-1,0,257,257",
"false,-1,-1,256,257",
"false,-1,-1,257,256",

"false,0,0,1,1",
"false,1,1,2,2",
"false,1,1,2,2",
})
void testIsFill(boolean isFill, double x1, double y1, double x2, double y2) {
for (int scale = 0; scale < 4; scale++) {
assertEquals(isFill, VectorTile.encodeGeometry(rectangle(x1, y1, x2, y2), scale).isFill(), "scale=" + scale);
}
}

@Test
void testCrossBoundaryNotFill() {
assertFalse(VectorTile.encodeGeometry(newPolygon(
-1, -1,
257, -1,
-1, 257,
-1, -1
), 0).isFill());
assertFalse(VectorTile.encodeGeometry(newPolygon(
257, -1,
-1, 257,
-1, -1,
257, -1
), 0).isFill());
assertFalse(VectorTile.encodeGeometry(newPolygon(
-1, 257,
-1, -1,
257, -1,
-1, 257
), 0).isFill());
assertFalse(VectorTile.encodeGeometry(newPolygon(
-1, -1,
513, -1,
-1, 513,
-1, -1
), 1).isFill());
}

private void testRoundTripAttrs(Map<String, Object> attrs) {
testRoundTrip(JTS_FACTORY.createPoint(new CoordinateXY(0, 0)), "layer", attrs, 1);
}