From dc7686f50741ecde43a5ee8e1d23a97cbb05211d Mon Sep 17 00:00:00 2001 From: pguerrerox Date: Sun, 12 Apr 2026 23:22:36 +0000 Subject: [PATCH] chore: improve postal data import observability Add progress logging and a status script for postal imports and neighbor builds, and ignore local raw and generated postal datasets. --- .gitignore | 3 ++ CHANGELOG.md | 5 +- README.md | 15 ++++++ db/scripts/build-postal-neighbors.ts | 71 ++++++++++++++++++++++++---- db/scripts/check-postal-status.ts | 41 ++++++++++++++++ db/scripts/import-postal-areas.ts | 32 +++++++++++-- db/scripts/postal-import-utils.ts | 70 +++++++++++++++++++++++---- db/scripts/postal-logging.ts | 22 +++++++++ db/scripts/stream-json-modules.d.ts | 7 +++ package-lock.json | 22 +++++++++ package.json | 2 + 11 files changed, 267 insertions(+), 23 deletions(-) create mode 100644 db/scripts/check-postal-status.ts create mode 100644 db/scripts/postal-logging.ts create mode 100644 db/scripts/stream-json-modules.d.ts diff --git a/.gitignore b/.gitignore index 0f09ab7..4e3861c 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,6 @@ coverage/ *.log .env* !.env.example +db/rawdata/ +db/datasets/postal/*.geojson +!db/datasets/postal/.gitkeep diff --git a/CHANGELOG.md b/CHANGELOG.md index 59b1db2..94216cc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,11 +2,12 @@ All notable changes to this project are documented in this file. -## 2026-03-27 +## 2026-04-12 ### Changed - Improved local development networking by making API env loading work with `.env.local`, adding LAN-friendly API URL fallback behavior, and fixing development CORS handling. - Fixed local research inserts so nullable Google Places coordinates no longer break business upserts. +- Improved postal data tooling with streaming imports, clearer CRS validation, progress logs, and a status command for checking imported areas and adjacency counts. ### Added - Added postal-area import and adjacency build scripts for US ZIP/ZCTA and Canada FSA datasets. @@ -16,6 +17,8 @@ All notable changes to this project are documented in this file. ### Removed - Removed stale local metadata, placeholder postal seeding code, and leftover Supabase-era repository artifacts. +## 2026-03-27 + ### Changed - Migrated the app from a Supabase runtime to a local Fastify API with PostgreSQL, PostGIS, and cookie-based session auth. - Reworked the research experience with the `Leads4less` branding, the renamed `Research` view, a top-form layout, and a filterable grid of research jobs. diff --git a/README.md b/README.md index 311a3c5..49cb576 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,21 @@ If you open the app from another machine on your LAN, set `VITE_API_BASE_URL` an 3. Import and build adjacency: `npm run seed:postal` +Notes: +- Raw postal source files and generated GeoJSON datasets under `db/rawdata/` and `db/datasets/postal/*.geojson` are ignored by git. +- The importer expects WGS84 lon/lat GeoJSON. If a source file is projected, re-export it to `EPSG:4326` before importing. + +Observability commands: +- `npm run import:postal` logs progress every 500 features by default. +- `npm run build:postal-neighbors` logs per-country progress and final adjacency totals. +- `npm run check:postal` prints current postal area and neighbor counts from the database. + +You can change the import log interval with `POSTAL_IMPORT_LOG_EVERY`, for example: +`POSTAL_IMPORT_LOG_EVERY=1000 npm run import:postal` + +If you want to validate the current postal tables after an import or neighbor build, run: +`npm run check:postal` + ## Google Maps Requirements Enable these Google Cloud APIs for the keys you use: diff --git a/db/scripts/build-postal-neighbors.ts b/db/scripts/build-postal-neighbors.ts index 31f7a93..c868409 100644 --- a/db/scripts/build-postal-neighbors.ts +++ b/db/scripts/build-postal-neighbors.ts @@ -1,13 +1,32 @@ import { getDbPool } from '../../server/src/db/pool.js'; +import { createScriptLogger } from './postal-logging.js'; -async function run() { +async function logPostalAreaCounts() { const pool = getDbPool(); - const client = await pool.connect(); + const result = await pool.query<{ country_code: string; area_count: string }>(` + select country_code, count(*)::text as area_count + from public.postal_areas + group by country_code + order by country_code asc + `); - try { - await client.query('begin'); - await client.query('truncate table public.postal_area_neighbors'); - await client.query(` + return result.rows; +} + +async function buildNeighborsForCountry(countryCode: 'US' | 'CA') { + const pool = getDbPool(); + const logger = createScriptLogger(`postal-neighbors:${countryCode.toLowerCase()}`); + + const sourceCountResult = await pool.query<{ area_count: string }>( + `select count(*)::text as area_count from public.postal_areas where country_code = $1`, + [countryCode], + ); + + const areaCount = sourceCountResult.rows[0]?.area_count ?? '0'; + logger.info(`Starting neighbor build for ${countryCode} with ${areaCount} postal areas.`); + + await pool.query( + ` insert into public.postal_area_neighbors (postal_area_id, neighbor_postal_area_id) select source.id, neighbor.id from public.postal_areas source @@ -15,13 +34,47 @@ async function run() { on source.country_code = neighbor.country_code and source.id <> neighbor.id and ST_Touches(source.geom, neighbor.geom) - `); + where source.country_code = $1 + `, + [countryCode], + ); + + const neighborCountResult = await pool.query<{ neighbor_count: string }>( + ` + select count(*)::text as neighbor_count + from public.postal_area_neighbors link + join public.postal_areas area on area.id = link.postal_area_id + where area.country_code = $1 + `, + [countryCode], + ); + + logger.info(`Completed neighbor build for ${countryCode}. Built ${neighborCountResult.rows[0]?.neighbor_count ?? '0'} adjacency links.`); +} + +async function run() { + const logger = createScriptLogger('postal-neighbors'); + const pool = getDbPool(); + const client = await pool.connect(); + + try { + logger.info('Gathering postal area counts before neighbor build.'); + const counts = await logPostalAreaCounts(); + counts.forEach((row) => logger.info(`Found ${row.area_count} postal areas for ${row.country_code}.`)); + + await client.query('begin'); + logger.info('Clearing existing postal adjacency links.'); + await client.query('truncate table public.postal_area_neighbors'); await client.query('commit'); - const summary = await client.query<{ count: string }>('select count(*)::text as count from public.postal_area_neighbors'); - console.log(`Built ${summary.rows[0]?.count ?? '0'} postal adjacency links.`); + await buildNeighborsForCountry('US'); + await buildNeighborsForCountry('CA'); + + const summary = await pool.query<{ count: string }>('select count(*)::text as count from public.postal_area_neighbors'); + logger.info(`Finished building postal neighbors. Total adjacency links: ${summary.rows[0]?.count ?? '0'}.`); } catch (error) { await client.query('rollback'); + logger.error('Neighbor build failed.'); throw error; } finally { client.release(); diff --git a/db/scripts/check-postal-status.ts b/db/scripts/check-postal-status.ts new file mode 100644 index 0000000..e59055f --- /dev/null +++ b/db/scripts/check-postal-status.ts @@ -0,0 +1,41 @@ +import { getDbPool } from '../../server/src/db/pool.js'; +import { createScriptLogger } from './postal-logging.js'; + +async function run() { + const logger = createScriptLogger('postal-status'); + const pool = getDbPool(); + + try { + const postalAreasByCountry = await pool.query<{ country_code: string; area_count: string }>(` + select country_code, count(*)::text as area_count + from public.postal_areas + group by country_code + order by country_code asc + `); + + const neighborCountsByCountry = await pool.query<{ country_code: string; neighbor_count: string }>(` + select area.country_code, count(*)::text as neighbor_count + from public.postal_area_neighbors link + join public.postal_areas area on area.id = link.postal_area_id + group by area.country_code + order by area.country_code asc + `); + + const totalAreas = await pool.query<{ count: string }>('select count(*)::text as count from public.postal_areas'); + const totalNeighbors = await pool.query<{ count: string }>('select count(*)::text as count from public.postal_area_neighbors'); + + logger.info(`Postal areas loaded: ${totalAreas.rows[0]?.count ?? '0'}`); + postalAreasByCountry.rows.forEach((row) => { + logger.info(` ${row.country_code}: ${row.area_count} postal areas`); + }); + + logger.info(`Postal neighbor links built: ${totalNeighbors.rows[0]?.count ?? '0'}`); + neighborCountsByCountry.rows.forEach((row) => { + logger.info(` ${row.country_code}: ${row.neighbor_count} adjacency links`); + }); + } finally { + await pool.end(); + } +} + +await run(); diff --git a/db/scripts/import-postal-areas.ts b/db/scripts/import-postal-areas.ts index a4357b8..631f601 100644 --- a/db/scripts/import-postal-areas.ts +++ b/db/scripts/import-postal-areas.ts @@ -1,7 +1,8 @@ import path from 'node:path'; import { fileURLToPath } from 'node:url'; import { getDbPool } from '../../server/src/db/pool.js'; -import { getFeatureGeometry, getStringProperty, normalizePostalCode, readFeatureCollection, type PostalDatasetConfig } from './postal-import-utils.js'; +import { createScriptLogger } from './postal-logging.js'; +import { assertGeometryIsWgs84, getFeatureGeometry, getStringProperty, normalizePostalCode, streamFeatureCollection, type PostalDatasetConfig } from './postal-import-utils.js'; const currentDir = path.dirname(fileURLToPath(import.meta.url)); const datasetsRoot = path.resolve(currentDir, '../datasets/postal'); @@ -23,33 +24,46 @@ const datasetConfigs: PostalDatasetConfig[] = [ }, ]; +const logEvery = Math.max(1, Number.parseInt(process.env.POSTAL_IMPORT_LOG_EVERY ?? '500', 10) || 500); + async function importDataset(config: PostalDatasetConfig) { const pool = getDbPool(); const client = await pool.connect(); + const logger = createScriptLogger(`postal-import:${config.countryCode.toLowerCase()}`); + let index = 0; try { - const collection = await readFeatureCollection(config.filePath); let insertedCount = 0; let skippedCount = 0; + logger.info(`Starting ${config.label} import from ${config.filePath}`); + await client.query('begin'); - for (const [index, feature] of collection.features.entries()) { + for await (const feature of streamFeatureCollection(config.filePath)) { + index += 1; const rawPostalCode = getStringProperty(feature.properties, config.postalCodeKeys); if (!rawPostalCode) { skippedCount += 1; + if (index % logEvery === 0) { + logger.info(`Processed ${index} features, inserted ${insertedCount}, skipped ${skippedCount}`); + } continue; } const normalizedPostalCode = normalizePostalCode(config.countryCode, rawPostalCode); if (!normalizedPostalCode) { skippedCount += 1; + if (index % logEvery === 0) { + logger.info(`Processed ${index} features, inserted ${insertedCount}, skipped ${skippedCount}`); + } continue; } const displayName = getStringProperty(feature.properties, config.displayNameKeys) || normalizedPostalCode; const geometry = getFeatureGeometry(feature, config.filePath, index); + assertGeometryIsWgs84(geometry, config.filePath, index); await client.query( ` @@ -72,7 +86,7 @@ async function importDataset(config: PostalDatasetConfig) { $4, ST_Multi(ST_SetSRID(ST_GeomFromGeoJSON($5), 4326)), ST_Centroid(ST_SetSRID(ST_GeomFromGeoJSON($5), 4326))::geography, - greatest(1000, round(sqrt(ST_Area(ST_SetSRID(ST_GeomFromGeoJSON($5), 4326)::geography) / pi()))::integer), + greatest(1000, round(sqrt(ST_Area(ST_Transform(ST_SetSRID(ST_GeomFromGeoJSON($5), 4326), 3857)) / pi()))::integer), $6::jsonb, now(), now() @@ -98,12 +112,17 @@ async function importDataset(config: PostalDatasetConfig) { ); insertedCount += 1; + + if (index % logEvery === 0) { + logger.info(`Processed ${index} features, inserted ${insertedCount}, skipped ${skippedCount}`); + } } await client.query('commit'); - console.log(`Imported ${insertedCount} ${config.label} areas from ${config.filePath}. Skipped ${skippedCount}.`); + logger.info(`Completed ${config.label} import. Inserted ${insertedCount} areas, skipped ${skippedCount}.`); } catch (error) { await client.query('rollback'); + logger.error(`Import failed after processing ${index} features.`); throw error; } finally { client.release(); @@ -111,10 +130,13 @@ async function importDataset(config: PostalDatasetConfig) { } async function run() { + const logger = createScriptLogger('postal-import'); + for (const config of datasetConfigs) { await importDataset(config); } + logger.info('All postal datasets imported successfully.'); await getDbPool().end(); } diff --git a/db/scripts/postal-import-utils.ts b/db/scripts/postal-import-utils.ts index 83e3af2..db71210 100644 --- a/db/scripts/postal-import-utils.ts +++ b/db/scripts/postal-import-utils.ts @@ -1,4 +1,8 @@ -import { readFile } from 'node:fs/promises'; +import { createReadStream } from 'node:fs'; +import { chain } from 'stream-chain'; +import { parser } from 'stream-json'; +import { pick } from 'stream-json/filters/pick'; +import { streamArray } from 'stream-json/streamers/stream-array'; export type FeatureGeometry = { type: 'Polygon' | 'MultiPolygon'; @@ -24,15 +28,49 @@ export type PostalDatasetConfig = { displayNameKeys: string[]; }; -export async function readFeatureCollection(filePath: string) { - const raw = await readFile(filePath, 'utf8'); - const parsed = JSON.parse(raw) as GeoJsonFeatureCollection; - - if (parsed.type !== 'FeatureCollection' || !Array.isArray(parsed.features)) { - throw new Error(`Dataset at ${filePath} is not a valid GeoJSON FeatureCollection.`); +function findFirstCoordinatePair(coordinates: unknown): [number, number] | null { + if (!Array.isArray(coordinates)) { + return null; } - return parsed; + if ( + coordinates.length >= 2 && + typeof coordinates[0] === 'number' && + Number.isFinite(coordinates[0]) && + typeof coordinates[1] === 'number' && + Number.isFinite(coordinates[1]) + ) { + return [coordinates[0], coordinates[1]]; + } + + for (const value of coordinates) { + const pair = findFirstCoordinatePair(value); + if (pair) { + return pair; + } + } + + return null; +} + +export async function *streamFeatureCollection(filePath: string) { + const pipeline = chain([ + createReadStream(filePath, { encoding: 'utf8' }), + parser(), + pick({ filter: 'features' }) as any, + streamArray() as any, + ]) as AsyncIterable<{ value: GeoJsonFeature }>; + + let foundFeatures = false; + + for await (const chunk of pipeline) { + foundFeatures = true; + yield chunk.value as GeoJsonFeature; + } + + if (!foundFeatures) { + throw new Error(`Dataset at ${filePath} is not a valid GeoJSON FeatureCollection with a features array.`); + } } export function normalizePostalCode(countryCode: 'US' | 'CA', rawPostalCode: string) { @@ -74,3 +112,19 @@ export function getFeatureGeometry(feature: GeoJsonFeature, filePath: string, in return feature.geometry; } + +export function assertGeometryIsWgs84(geometry: FeatureGeometry, filePath: string, index: number) { + const pair = findFirstCoordinatePair(geometry.coordinates); + + if (!pair) { + throw new Error(`Feature ${index} in ${filePath} does not contain readable coordinates.`); + } + + const [lng, lat] = pair; + + if (Math.abs(lng) > 180 || Math.abs(lat) > 90) { + throw new Error( + `Feature ${index} in ${filePath} is not in WGS84 lon/lat coordinates. Re-export the dataset to GeoJSON with EPSG:4326, for example with ogr2ogr -t_srs EPSG:4326.`, + ); + } +} diff --git a/db/scripts/postal-logging.ts b/db/scripts/postal-logging.ts new file mode 100644 index 0000000..070431e --- /dev/null +++ b/db/scripts/postal-logging.ts @@ -0,0 +1,22 @@ +export function createScriptLogger(prefix: string) { + const startedAt = Date.now(); + + function elapsed() { + const totalSeconds = Math.floor((Date.now() - startedAt) / 1000); + const minutes = Math.floor(totalSeconds / 60); + const seconds = totalSeconds % 60; + return `${minutes}m ${seconds}s`; + } + + return { + info(message: string) { + console.log(`[${prefix}] ${message} (${elapsed()})`); + }, + warn(message: string) { + console.warn(`[${prefix}] ${message} (${elapsed()})`); + }, + error(message: string) { + console.error(`[${prefix}] ${message} (${elapsed()})`); + }, + }; +} diff --git a/db/scripts/stream-json-modules.d.ts b/db/scripts/stream-json-modules.d.ts new file mode 100644 index 0000000..cad7739 --- /dev/null +++ b/db/scripts/stream-json-modules.d.ts @@ -0,0 +1,7 @@ +declare module 'stream-json/filters/pick' { + export function pick(options: { filter: string }): NodeJS.ReadWriteStream; +} + +declare module 'stream-json/streamers/stream-array' { + export function streamArray(): NodeJS.ReadWriteStream; +} diff --git a/package-lock.json b/package-lock.json index b88082a..86efadf 100644 --- a/package-lock.json +++ b/package-lock.json @@ -23,6 +23,7 @@ "pg-boss": "^12.14.0", "react": "^19.0.0", "react-dom": "^19.0.0", + "stream-json": "^2.1.0", "tailwind-merge": "^3.5.0", "vite": "^6.2.0", "zod": "^4.3.6" @@ -3225,6 +3226,27 @@ "node": ">= 10.x" } }, + "node_modules/stream-chain": { + "version": "3.6.1", + "resolved": "https://registry.npmjs.org/stream-chain/-/stream-chain-3.6.1.tgz", + "integrity": "sha512-M4BQpNPI71uumkVXjl4y+mIormQXdo4R0pSR23mcLbn6D+kpvu7Kx2g1hf0jRB76Zb1IT1M06OIGghMTAtZdyQ==", + "license": "BSD-3-Clause", + "funding": { + "url": "https://github.com/sponsors/uhop" + } + }, + "node_modules/stream-json": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/stream-json/-/stream-json-2.1.0.tgz", + "integrity": "sha512-9gV/ywtebMn3DdKnNKYCb9iESvgR1dHbucNV+bRGvdvy+jV4c9FFgYKmENhpKv58jSwvs90Wk80RhfKk1KxHPg==", + "license": "BSD-3-Clause", + "dependencies": { + "stream-chain": "^3.6.1" + }, + "funding": { + "url": "https://github.com/sponsors/uhop" + } + }, "node_modules/tagged-tag": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/tagged-tag/-/tagged-tag-1.0.0.tgz", diff --git a/package.json b/package.json index 67d4c38..0babfc7 100644 --- a/package.json +++ b/package.json @@ -16,6 +16,7 @@ "migrate": "tsx --tsconfig tsconfig.server.json db/scripts/migrate.ts", "import:postal": "tsx --tsconfig tsconfig.server.json db/scripts/import-postal-areas.ts", "build:postal-neighbors": "tsx --tsconfig tsconfig.server.json db/scripts/build-postal-neighbors.ts", + "check:postal": "tsx --tsconfig tsconfig.server.json db/scripts/check-postal-status.ts", "seed:postal": "npm run import:postal && npm run build:postal-neighbors", "start:api": "node dist-server/server/src/index.js", "start:worker": "node dist-server/server/src/worker.js" @@ -36,6 +37,7 @@ "pg-boss": "^12.14.0", "react": "^19.0.0", "react-dom": "^19.0.0", + "stream-json": "^2.1.0", "tailwind-merge": "^3.5.0", "vite": "^6.2.0", "zod": "^4.3.6"