Public Access
1
0

chore: improve postal data import observability

Add progress logging and a status script for postal imports and neighbor builds, and ignore local raw and generated postal datasets.
This commit is contained in:
pguerrerox
2026-04-12 23:22:36 +00:00
parent cc00a439bf
commit dc7686f507
11 changed files with 267 additions and 23 deletions
+3
View File
@@ -7,3 +7,6 @@ coverage/
*.log *.log
.env* .env*
!.env.example !.env.example
db/rawdata/
db/datasets/postal/*.geojson
!db/datasets/postal/.gitkeep
+4 -1
View File
@@ -2,11 +2,12 @@
All notable changes to this project are documented in this file. All notable changes to this project are documented in this file.
## 2026-03-27 ## 2026-04-12
### Changed ### Changed
- Improved local development networking by making API env loading work with `.env.local`, adding LAN-friendly API URL fallback behavior, and fixing development CORS handling. - Improved local development networking by making API env loading work with `.env.local`, adding LAN-friendly API URL fallback behavior, and fixing development CORS handling.
- Fixed local research inserts so nullable Google Places coordinates no longer break business upserts. - Fixed local research inserts so nullable Google Places coordinates no longer break business upserts.
- Improved postal data tooling with streaming imports, clearer CRS validation, progress logs, and a status command for checking imported areas and adjacency counts.
### Added ### Added
- Added postal-area import and adjacency build scripts for US ZIP/ZCTA and Canada FSA datasets. - Added postal-area import and adjacency build scripts for US ZIP/ZCTA and Canada FSA datasets.
@@ -16,6 +17,8 @@ All notable changes to this project are documented in this file.
### Removed ### Removed
- Removed stale local metadata, placeholder postal seeding code, and leftover Supabase-era repository artifacts. - Removed stale local metadata, placeholder postal seeding code, and leftover Supabase-era repository artifacts.
## 2026-03-27
### Changed ### Changed
- Migrated the app from a Supabase runtime to a local Fastify API with PostgreSQL, PostGIS, and cookie-based session auth. - Migrated the app from a Supabase runtime to a local Fastify API with PostgreSQL, PostGIS, and cookie-based session auth.
- Reworked the research experience with the `Leads4less` branding, the renamed `Research` view, a top-form layout, and a filterable grid of research jobs. - Reworked the research experience with the `Leads4less` branding, the renamed `Research` view, a top-form layout, and a filterable grid of research jobs.
+15
View File
@@ -54,6 +54,21 @@ If you open the app from another machine on your LAN, set `VITE_API_BASE_URL` an
3. Import and build adjacency: 3. Import and build adjacency:
`npm run seed:postal` `npm run seed:postal`
Notes:
- Raw postal source files and generated GeoJSON datasets under `db/rawdata/` and `db/datasets/postal/*.geojson` are ignored by git.
- The importer expects WGS84 lon/lat GeoJSON. If a source file is projected, re-export it to `EPSG:4326` before importing.
Observability commands:
- `npm run import:postal` logs progress every 500 features by default.
- `npm run build:postal-neighbors` logs per-country progress and final adjacency totals.
- `npm run check:postal` prints current postal area and neighbor counts from the database.
You can change the import log interval with `POSTAL_IMPORT_LOG_EVERY`, for example:
`POSTAL_IMPORT_LOG_EVERY=1000 npm run import:postal`
If you want to validate the current postal tables after an import or neighbor build, run:
`npm run check:postal`
## Google Maps Requirements ## Google Maps Requirements
Enable these Google Cloud APIs for the keys you use: Enable these Google Cloud APIs for the keys you use:
+62 -9
View File
@@ -1,13 +1,32 @@
import { getDbPool } from '../../server/src/db/pool.js'; import { getDbPool } from '../../server/src/db/pool.js';
import { createScriptLogger } from './postal-logging.js';
async function run() { async function logPostalAreaCounts() {
const pool = getDbPool(); const pool = getDbPool();
const client = await pool.connect(); const result = await pool.query<{ country_code: string; area_count: string }>(`
select country_code, count(*)::text as area_count
from public.postal_areas
group by country_code
order by country_code asc
`);
try { return result.rows;
await client.query('begin'); }
await client.query('truncate table public.postal_area_neighbors');
await client.query(` async function buildNeighborsForCountry(countryCode: 'US' | 'CA') {
const pool = getDbPool();
const logger = createScriptLogger(`postal-neighbors:${countryCode.toLowerCase()}`);
const sourceCountResult = await pool.query<{ area_count: string }>(
`select count(*)::text as area_count from public.postal_areas where country_code = $1`,
[countryCode],
);
const areaCount = sourceCountResult.rows[0]?.area_count ?? '0';
logger.info(`Starting neighbor build for ${countryCode} with ${areaCount} postal areas.`);
await pool.query(
`
insert into public.postal_area_neighbors (postal_area_id, neighbor_postal_area_id) insert into public.postal_area_neighbors (postal_area_id, neighbor_postal_area_id)
select source.id, neighbor.id select source.id, neighbor.id
from public.postal_areas source from public.postal_areas source
@@ -15,13 +34,47 @@ async function run() {
on source.country_code = neighbor.country_code on source.country_code = neighbor.country_code
and source.id <> neighbor.id and source.id <> neighbor.id
and ST_Touches(source.geom, neighbor.geom) and ST_Touches(source.geom, neighbor.geom)
`); where source.country_code = $1
`,
[countryCode],
);
const neighborCountResult = await pool.query<{ neighbor_count: string }>(
`
select count(*)::text as neighbor_count
from public.postal_area_neighbors link
join public.postal_areas area on area.id = link.postal_area_id
where area.country_code = $1
`,
[countryCode],
);
logger.info(`Completed neighbor build for ${countryCode}. Built ${neighborCountResult.rows[0]?.neighbor_count ?? '0'} adjacency links.`);
}
async function run() {
const logger = createScriptLogger('postal-neighbors');
const pool = getDbPool();
const client = await pool.connect();
try {
logger.info('Gathering postal area counts before neighbor build.');
const counts = await logPostalAreaCounts();
counts.forEach((row) => logger.info(`Found ${row.area_count} postal areas for ${row.country_code}.`));
await client.query('begin');
logger.info('Clearing existing postal adjacency links.');
await client.query('truncate table public.postal_area_neighbors');
await client.query('commit'); await client.query('commit');
const summary = await client.query<{ count: string }>('select count(*)::text as count from public.postal_area_neighbors'); await buildNeighborsForCountry('US');
console.log(`Built ${summary.rows[0]?.count ?? '0'} postal adjacency links.`); await buildNeighborsForCountry('CA');
const summary = await pool.query<{ count: string }>('select count(*)::text as count from public.postal_area_neighbors');
logger.info(`Finished building postal neighbors. Total adjacency links: ${summary.rows[0]?.count ?? '0'}.`);
} catch (error) { } catch (error) {
await client.query('rollback'); await client.query('rollback');
logger.error('Neighbor build failed.');
throw error; throw error;
} finally { } finally {
client.release(); client.release();
+41
View File
@@ -0,0 +1,41 @@
import { getDbPool } from '../../server/src/db/pool.js';
import { createScriptLogger } from './postal-logging.js';
async function run() {
const logger = createScriptLogger('postal-status');
const pool = getDbPool();
try {
const postalAreasByCountry = await pool.query<{ country_code: string; area_count: string }>(`
select country_code, count(*)::text as area_count
from public.postal_areas
group by country_code
order by country_code asc
`);
const neighborCountsByCountry = await pool.query<{ country_code: string; neighbor_count: string }>(`
select area.country_code, count(*)::text as neighbor_count
from public.postal_area_neighbors link
join public.postal_areas area on area.id = link.postal_area_id
group by area.country_code
order by area.country_code asc
`);
const totalAreas = await pool.query<{ count: string }>('select count(*)::text as count from public.postal_areas');
const totalNeighbors = await pool.query<{ count: string }>('select count(*)::text as count from public.postal_area_neighbors');
logger.info(`Postal areas loaded: ${totalAreas.rows[0]?.count ?? '0'}`);
postalAreasByCountry.rows.forEach((row) => {
logger.info(` ${row.country_code}: ${row.area_count} postal areas`);
});
logger.info(`Postal neighbor links built: ${totalNeighbors.rows[0]?.count ?? '0'}`);
neighborCountsByCountry.rows.forEach((row) => {
logger.info(` ${row.country_code}: ${row.neighbor_count} adjacency links`);
});
} finally {
await pool.end();
}
}
await run();
+27 -5
View File
@@ -1,7 +1,8 @@
import path from 'node:path'; import path from 'node:path';
import { fileURLToPath } from 'node:url'; import { fileURLToPath } from 'node:url';
import { getDbPool } from '../../server/src/db/pool.js'; import { getDbPool } from '../../server/src/db/pool.js';
import { getFeatureGeometry, getStringProperty, normalizePostalCode, readFeatureCollection, type PostalDatasetConfig } from './postal-import-utils.js'; import { createScriptLogger } from './postal-logging.js';
import { assertGeometryIsWgs84, getFeatureGeometry, getStringProperty, normalizePostalCode, streamFeatureCollection, type PostalDatasetConfig } from './postal-import-utils.js';
const currentDir = path.dirname(fileURLToPath(import.meta.url)); const currentDir = path.dirname(fileURLToPath(import.meta.url));
const datasetsRoot = path.resolve(currentDir, '../datasets/postal'); const datasetsRoot = path.resolve(currentDir, '../datasets/postal');
@@ -23,33 +24,46 @@ const datasetConfigs: PostalDatasetConfig[] = [
}, },
]; ];
const logEvery = Math.max(1, Number.parseInt(process.env.POSTAL_IMPORT_LOG_EVERY ?? '500', 10) || 500);
async function importDataset(config: PostalDatasetConfig) { async function importDataset(config: PostalDatasetConfig) {
const pool = getDbPool(); const pool = getDbPool();
const client = await pool.connect(); const client = await pool.connect();
const logger = createScriptLogger(`postal-import:${config.countryCode.toLowerCase()}`);
let index = 0;
try { try {
const collection = await readFeatureCollection(config.filePath);
let insertedCount = 0; let insertedCount = 0;
let skippedCount = 0; let skippedCount = 0;
logger.info(`Starting ${config.label} import from ${config.filePath}`);
await client.query('begin'); await client.query('begin');
for (const [index, feature] of collection.features.entries()) { for await (const feature of streamFeatureCollection(config.filePath)) {
index += 1;
const rawPostalCode = getStringProperty(feature.properties, config.postalCodeKeys); const rawPostalCode = getStringProperty(feature.properties, config.postalCodeKeys);
if (!rawPostalCode) { if (!rawPostalCode) {
skippedCount += 1; skippedCount += 1;
if (index % logEvery === 0) {
logger.info(`Processed ${index} features, inserted ${insertedCount}, skipped ${skippedCount}`);
}
continue; continue;
} }
const normalizedPostalCode = normalizePostalCode(config.countryCode, rawPostalCode); const normalizedPostalCode = normalizePostalCode(config.countryCode, rawPostalCode);
if (!normalizedPostalCode) { if (!normalizedPostalCode) {
skippedCount += 1; skippedCount += 1;
if (index % logEvery === 0) {
logger.info(`Processed ${index} features, inserted ${insertedCount}, skipped ${skippedCount}`);
}
continue; continue;
} }
const displayName = getStringProperty(feature.properties, config.displayNameKeys) || normalizedPostalCode; const displayName = getStringProperty(feature.properties, config.displayNameKeys) || normalizedPostalCode;
const geometry = getFeatureGeometry(feature, config.filePath, index); const geometry = getFeatureGeometry(feature, config.filePath, index);
assertGeometryIsWgs84(geometry, config.filePath, index);
await client.query( await client.query(
` `
@@ -72,7 +86,7 @@ async function importDataset(config: PostalDatasetConfig) {
$4, $4,
ST_Multi(ST_SetSRID(ST_GeomFromGeoJSON($5), 4326)), ST_Multi(ST_SetSRID(ST_GeomFromGeoJSON($5), 4326)),
ST_Centroid(ST_SetSRID(ST_GeomFromGeoJSON($5), 4326))::geography, ST_Centroid(ST_SetSRID(ST_GeomFromGeoJSON($5), 4326))::geography,
greatest(1000, round(sqrt(ST_Area(ST_SetSRID(ST_GeomFromGeoJSON($5), 4326)::geography) / pi()))::integer), greatest(1000, round(sqrt(ST_Area(ST_Transform(ST_SetSRID(ST_GeomFromGeoJSON($5), 4326), 3857)) / pi()))::integer),
$6::jsonb, $6::jsonb,
now(), now(),
now() now()
@@ -98,12 +112,17 @@ async function importDataset(config: PostalDatasetConfig) {
); );
insertedCount += 1; insertedCount += 1;
if (index % logEvery === 0) {
logger.info(`Processed ${index} features, inserted ${insertedCount}, skipped ${skippedCount}`);
}
} }
await client.query('commit'); await client.query('commit');
console.log(`Imported ${insertedCount} ${config.label} areas from ${config.filePath}. Skipped ${skippedCount}.`); logger.info(`Completed ${config.label} import. Inserted ${insertedCount} areas, skipped ${skippedCount}.`);
} catch (error) { } catch (error) {
await client.query('rollback'); await client.query('rollback');
logger.error(`Import failed after processing ${index} features.`);
throw error; throw error;
} finally { } finally {
client.release(); client.release();
@@ -111,10 +130,13 @@ async function importDataset(config: PostalDatasetConfig) {
} }
async function run() { async function run() {
const logger = createScriptLogger('postal-import');
for (const config of datasetConfigs) { for (const config of datasetConfigs) {
await importDataset(config); await importDataset(config);
} }
logger.info('All postal datasets imported successfully.');
await getDbPool().end(); await getDbPool().end();
} }
+62 -8
View File
@@ -1,4 +1,8 @@
import { readFile } from 'node:fs/promises'; import { createReadStream } from 'node:fs';
import { chain } from 'stream-chain';
import { parser } from 'stream-json';
import { pick } from 'stream-json/filters/pick';
import { streamArray } from 'stream-json/streamers/stream-array';
export type FeatureGeometry = { export type FeatureGeometry = {
type: 'Polygon' | 'MultiPolygon'; type: 'Polygon' | 'MultiPolygon';
@@ -24,15 +28,49 @@ export type PostalDatasetConfig = {
displayNameKeys: string[]; displayNameKeys: string[];
}; };
export async function readFeatureCollection(filePath: string) { function findFirstCoordinatePair(coordinates: unknown): [number, number] | null {
const raw = await readFile(filePath, 'utf8'); if (!Array.isArray(coordinates)) {
const parsed = JSON.parse(raw) as GeoJsonFeatureCollection; return null;
if (parsed.type !== 'FeatureCollection' || !Array.isArray(parsed.features)) {
throw new Error(`Dataset at ${filePath} is not a valid GeoJSON FeatureCollection.`);
} }
return parsed; if (
coordinates.length >= 2 &&
typeof coordinates[0] === 'number' &&
Number.isFinite(coordinates[0]) &&
typeof coordinates[1] === 'number' &&
Number.isFinite(coordinates[1])
) {
return [coordinates[0], coordinates[1]];
}
for (const value of coordinates) {
const pair = findFirstCoordinatePair(value);
if (pair) {
return pair;
}
}
return null;
}
export async function *streamFeatureCollection(filePath: string) {
const pipeline = chain([
createReadStream(filePath, { encoding: 'utf8' }),
parser(),
pick({ filter: 'features' }) as any,
streamArray() as any,
]) as AsyncIterable<{ value: GeoJsonFeature }>;
let foundFeatures = false;
for await (const chunk of pipeline) {
foundFeatures = true;
yield chunk.value as GeoJsonFeature;
}
if (!foundFeatures) {
throw new Error(`Dataset at ${filePath} is not a valid GeoJSON FeatureCollection with a features array.`);
}
} }
export function normalizePostalCode(countryCode: 'US' | 'CA', rawPostalCode: string) { export function normalizePostalCode(countryCode: 'US' | 'CA', rawPostalCode: string) {
@@ -74,3 +112,19 @@ export function getFeatureGeometry(feature: GeoJsonFeature, filePath: string, in
return feature.geometry; return feature.geometry;
} }
export function assertGeometryIsWgs84(geometry: FeatureGeometry, filePath: string, index: number) {
const pair = findFirstCoordinatePair(geometry.coordinates);
if (!pair) {
throw new Error(`Feature ${index} in ${filePath} does not contain readable coordinates.`);
}
const [lng, lat] = pair;
if (Math.abs(lng) > 180 || Math.abs(lat) > 90) {
throw new Error(
`Feature ${index} in ${filePath} is not in WGS84 lon/lat coordinates. Re-export the dataset to GeoJSON with EPSG:4326, for example with ogr2ogr -t_srs EPSG:4326.`,
);
}
}
+22
View File
@@ -0,0 +1,22 @@
export function createScriptLogger(prefix: string) {
const startedAt = Date.now();
function elapsed() {
const totalSeconds = Math.floor((Date.now() - startedAt) / 1000);
const minutes = Math.floor(totalSeconds / 60);
const seconds = totalSeconds % 60;
return `${minutes}m ${seconds}s`;
}
return {
info(message: string) {
console.log(`[${prefix}] ${message} (${elapsed()})`);
},
warn(message: string) {
console.warn(`[${prefix}] ${message} (${elapsed()})`);
},
error(message: string) {
console.error(`[${prefix}] ${message} (${elapsed()})`);
},
};
}
+7
View File
@@ -0,0 +1,7 @@
declare module 'stream-json/filters/pick' {
export function pick(options: { filter: string }): NodeJS.ReadWriteStream;
}
declare module 'stream-json/streamers/stream-array' {
export function streamArray(): NodeJS.ReadWriteStream;
}
+22
View File
@@ -23,6 +23,7 @@
"pg-boss": "^12.14.0", "pg-boss": "^12.14.0",
"react": "^19.0.0", "react": "^19.0.0",
"react-dom": "^19.0.0", "react-dom": "^19.0.0",
"stream-json": "^2.1.0",
"tailwind-merge": "^3.5.0", "tailwind-merge": "^3.5.0",
"vite": "^6.2.0", "vite": "^6.2.0",
"zod": "^4.3.6" "zod": "^4.3.6"
@@ -3225,6 +3226,27 @@
"node": ">= 10.x" "node": ">= 10.x"
} }
}, },
"node_modules/stream-chain": {
"version": "3.6.1",
"resolved": "https://registry.npmjs.org/stream-chain/-/stream-chain-3.6.1.tgz",
"integrity": "sha512-M4BQpNPI71uumkVXjl4y+mIormQXdo4R0pSR23mcLbn6D+kpvu7Kx2g1hf0jRB76Zb1IT1M06OIGghMTAtZdyQ==",
"license": "BSD-3-Clause",
"funding": {
"url": "https://github.com/sponsors/uhop"
}
},
"node_modules/stream-json": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/stream-json/-/stream-json-2.1.0.tgz",
"integrity": "sha512-9gV/ywtebMn3DdKnNKYCb9iESvgR1dHbucNV+bRGvdvy+jV4c9FFgYKmENhpKv58jSwvs90Wk80RhfKk1KxHPg==",
"license": "BSD-3-Clause",
"dependencies": {
"stream-chain": "^3.6.1"
},
"funding": {
"url": "https://github.com/sponsors/uhop"
}
},
"node_modules/tagged-tag": { "node_modules/tagged-tag": {
"version": "1.0.0", "version": "1.0.0",
"resolved": "https://registry.npmjs.org/tagged-tag/-/tagged-tag-1.0.0.tgz", "resolved": "https://registry.npmjs.org/tagged-tag/-/tagged-tag-1.0.0.tgz",
+2
View File
@@ -16,6 +16,7 @@
"migrate": "tsx --tsconfig tsconfig.server.json db/scripts/migrate.ts", "migrate": "tsx --tsconfig tsconfig.server.json db/scripts/migrate.ts",
"import:postal": "tsx --tsconfig tsconfig.server.json db/scripts/import-postal-areas.ts", "import:postal": "tsx --tsconfig tsconfig.server.json db/scripts/import-postal-areas.ts",
"build:postal-neighbors": "tsx --tsconfig tsconfig.server.json db/scripts/build-postal-neighbors.ts", "build:postal-neighbors": "tsx --tsconfig tsconfig.server.json db/scripts/build-postal-neighbors.ts",
"check:postal": "tsx --tsconfig tsconfig.server.json db/scripts/check-postal-status.ts",
"seed:postal": "npm run import:postal && npm run build:postal-neighbors", "seed:postal": "npm run import:postal && npm run build:postal-neighbors",
"start:api": "node dist-server/server/src/index.js", "start:api": "node dist-server/server/src/index.js",
"start:worker": "node dist-server/server/src/worker.js" "start:worker": "node dist-server/server/src/worker.js"
@@ -36,6 +37,7 @@
"pg-boss": "^12.14.0", "pg-boss": "^12.14.0",
"react": "^19.0.0", "react": "^19.0.0",
"react-dom": "^19.0.0", "react-dom": "^19.0.0",
"stream-json": "^2.1.0",
"tailwind-merge": "^3.5.0", "tailwind-merge": "^3.5.0",
"vite": "^6.2.0", "vite": "^6.2.0",
"zod": "^4.3.6" "zod": "^4.3.6"