Initial
commit 71f42e1d88
14 changed files with 2142 additions and 0 deletions

.env (Normal file, 1 addition)
@@ -0,0 +1 @@
DATABASE_URL=postgres://postgres:example@localhost:5433/searchinblog

.envrc (Normal file, 1 addition)
@@ -0,0 +1 @@
use flake

.gitignore (vendored, Normal file, 2 additions)
@@ -0,0 +1,2 @@
/target
.direnv

Cargo.lock (generated, Normal file, 1832 additions)
File diff suppressed because it is too large.

Cargo.toml (Normal file, 14 additions)
@@ -0,0 +1,14 @@
[package]
name = "searchinblog"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
anyhow = "1.0.68"
axum = { version = "0.6.4", features = ["http2", "macros"] }
clap = { version = "4.1.4", features = ["derive", "cargo"] }
reqwest = { version = "0.11.14", default-features = false, features = ["rustls-tls-webpki-roots", "json", "gzip", "mime_guess"] }
scraper = "0.14.0"
tokio = { version = "1.24.2", features = ["full"] }

README.md (Normal file, 14 additions)
@@ -0,0 +1,14 @@
## Crawling requirements

- You must serve a robots.txt describing the terms of crawling; otherwise the
  site will be ignored by this bot.

- You must serve an Atom feed.

## RFC References

- **[RFC4287]**: The Atom Syndication Format
- **[RFC9309]**: Robots Exclusion Protocol

[rfc4287]: https://www.rfc-editor.org/rfc/rfc4287
[rfc9309]: https://www.rfc-editor.org/rfc/rfc9309
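
For illustration, a site that satisfies both requirements could serve a permissive robots.txt and advertise its feed from the HTML head; the feed path and policy below are made-up examples, not something this commit prescribes. The type attribute is what src/crawler.rs selects on.

  robots.txt:
    User-agent: *
    Allow: /

  HTML head:
    <link rel="alternate" type="application/atom+xml" title="Atom feed" href="/atom.xml">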

default.nix (Normal file, 12 additions)
@@ -0,0 +1,12 @@
{ toolchain, makeRustPlatform, pkg-config }:

let rustPlatform = makeRustPlatform { inherit (toolchain) cargo rustc; };

in rustPlatform.buildRustPackage {
  name = "searchinblog";
  src = ./.;
  cargoLock.lockFile = ./Cargo.lock;

  nativeBuildInputs = [ pkg-config ];
  buildInputs = [ ];
}

docker-compose.yml (Normal file, 8 additions)
@@ -0,0 +1,8 @@
version: "3"

services:
  db:
    image: postgres
    ports: ["5433:5432"]
    environment:
      POSTGRES_PASSWORD: example
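
Presumably the workflow is to bring up this Postgres container and then apply the migration against it; with the sqlx-cli from the dev shell and the DATABASE_URL from .env, that would look roughly like:

  docker compose up -d db
  sqlx database create
  sqlx migrate run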

flake.lock (Normal file, 93 additions)
@@ -0,0 +1,93 @@
{
  "nodes": {
    "fenix": {
      "inputs": {
        "nixpkgs": "nixpkgs",
        "rust-analyzer-src": "rust-analyzer-src"
      },
      "locked": {
        "lastModified": 1674541370,
        "narHash": "sha256-L62dKDX6fIUQhlna9R8PKSAEGZ7ueU5gRGxZzZs/Zx8=",
        "owner": "nix-community",
        "repo": "fenix",
        "rev": "4435f8e9da13581e51ba1f92a25d7d54c776ad94",
        "type": "github"
      },
      "original": {
        "owner": "nix-community",
        "repo": "fenix",
        "type": "github"
      }
    },
    "flake-utils": {
      "locked": {
        "lastModified": 1667395993,
        "narHash": "sha256-nuEHfE/LcWyuSWnS8t12N1wc105Qtau+/OdUAjtQ0rA=",
        "owner": "numtide",
        "repo": "flake-utils",
        "rev": "5aed5285a952e0b949eb3ba02c12fa4fcfef535f",
        "type": "github"
      },
      "original": {
        "id": "flake-utils",
        "type": "indirect"
      }
    },
    "nixpkgs": {
      "locked": {
        "lastModified": 1674211260,
        "narHash": "sha256-xU6Rv9sgnwaWK7tgCPadV6HhI2Y/fl4lKxJoG2+m9qs=",
        "owner": "nixos",
        "repo": "nixpkgs",
        "rev": "5ed481943351e9fd354aeb557679624224de38d5",
        "type": "github"
      },
      "original": {
        "owner": "nixos",
        "ref": "nixos-unstable",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "nixpkgs_2": {
      "locked": {
        "lastModified": 1667629849,
        "narHash": "sha256-P+v+nDOFWicM4wziFK9S/ajF2lc0N2Rg9p6Y35uMoZI=",
        "owner": "nixos",
        "repo": "nixpkgs",
        "rev": "3bacde6273b09a21a8ccfba15586fb165078fb62",
        "type": "github"
      },
      "original": {
        "id": "nixpkgs",
        "type": "indirect"
      }
    },
    "root": {
      "inputs": {
        "fenix": "fenix",
        "flake-utils": "flake-utils",
        "nixpkgs": "nixpkgs_2"
      }
    },
    "rust-analyzer-src": {
      "flake": false,
      "locked": {
        "lastModified": 1674491573,
        "narHash": "sha256-1hMPOn2dlMfWRWvuaWcSxNquKpvjGVXq2rVw6UJy75Q=",
        "owner": "rust-lang",
        "repo": "rust-analyzer",
        "rev": "c552e5a55f13b2f08d506bb46fb74dbc11702d0d",
        "type": "github"
      },
      "original": {
        "owner": "rust-lang",
        "ref": "nightly",
        "repo": "rust-analyzer",
        "type": "github"
      }
    }
  },
  "root": "root",
  "version": 7
}

flake.nix (Normal file, 28 additions)
@@ -0,0 +1,28 @@
{
  inputs = { fenix.url = "github:nix-community/fenix"; };

  outputs = { self, nixpkgs, flake-utils, fenix }:
    flake-utils.lib.eachDefaultSystem (system:
      let
        pkgs = import nixpkgs {
          inherit system;
          overlays = [ fenix.overlays.default ];
        };

        toolchain = pkgs.fenix.stable;

        flakePkgs = {
          searchinblog = pkgs.callPackage ./. { inherit toolchain; };
        };
      in rec {
        packages = flake-utils.lib.flattenTree flakePkgs;
        defaultPackage = packages.searchinblog;

        devShell = pkgs.mkShell {
          inputsFrom = with packages; [ searchinblog ];
          packages = (with pkgs; [ cargo-watch cargo-deny cargo-edit sqlx-cli ])
            ++ (with toolchain; [ cargo rustc rustfmt ]);
          CARGO_UNSTABLE_SPARSE_REGISTRY = "true";
        };
      });
}
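
Since the flake exposes both a package and a dev shell, the usual entry points would be nix develop for the pinned toolchain plus cargo utilities, and nix build for the package defined in default.nix:

  nix develop
  nix build .#searchinblog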

migrations/20230125084829_initial.sql (Normal file, 34 additions)
@@ -0,0 +1,34 @@
CREATE TABLE websites (
  id INTEGER PRIMARY KEY,
  domain TEXT UNIQUE,
  last_collected TIMESTAMP WITHOUT TIME ZONE
);

CREATE TABLE posts (
  id INTEGER PRIMARY KEY,
  url TEXT UNIQUE,
  last_updated_by_version TEXT,
  content TEXT,
  content_tsv TSVECTOR
);

CREATE INDEX "posts_fti_idx" ON posts USING GIN(content_tsv);
CREATE TRIGGER "posts_fti_trigger" BEFORE INSERT OR UPDATE
  ON posts FOR EACH ROW
  EXECUTE PROCEDURE tsvector_update_trigger(content_tsv, 'pg_catalog.english', content);

CREATE TABLE authors (
  id INTEGER PRIMARY KEY,
  url TEXT UNIQUE
);

CREATE TABLE posts_keywords (
  post_id INTEGER,
  keyword TEXT,

  PRIMARY KEY(post_id, keyword),
  CONSTRAINT fk_post FOREIGN KEY(post_id) REFERENCES posts(id)
  -- CONSTRAINT fk_keyword FOREIGN KEY(keyword) REFERENCES keywords(keyword)
);

CREATE INDEX "posts_keywords_idx" ON posts_keywords(keyword);
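
To see how this schema is meant to be used, a full-text query against the GIN-indexed content_tsv column would look roughly like the following (the search terms are only an example):

  SELECT url
  FROM posts
  WHERE content_tsv @@ to_tsquery('pg_catalog.english', 'rust & crawler')
  ORDER BY ts_rank(content_tsv, to_tsquery('pg_catalog.english', 'rust & crawler')) DESC
  LIMIT 10;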

rustfmt.toml (Normal file, 2 additions)
@@ -0,0 +1,2 @@
max_width = 80
tab_spaces = 2

src/crawler.rs (Normal file, 54 additions)
@@ -0,0 +1,54 @@
//! Crawler
//! ===
//!
//! The crawler design is built on an important Postgres feature: pub/sub
//! (LISTEN/NOTIFY). It always waits on two events:
//!
//! - One that listens to Postgres pub/sub triggers
//! - One that waits on a timer based on the next available job
//!
//! Once either event fires, the crawler retrieves the job from the database
//! and performs the crawl: it fetches the site, populates the database, and
//! fills the full-text index.

use anyhow::Result;
use reqwest::Url;
use scraper::{Html, Selector};

pub async fn crawler() -> Result<()> {
  Ok(())
}

#[derive(Debug, Parser)]
pub struct CrawlSingleSiteOpts {
  /// The domain to crawl
  #[clap(name = "domain", long)]
  domain: String,
}

/// Crawl a single site.
pub async fn crawl_single_site(opts: CrawlSingleSiteOpts) -> Result<()> {
  let CrawlSingleSiteOpts { domain, .. } = opts;

  // Check the robots.txt of the site to see if or how we can crawl
  let url = Url::parse(&format!("https://{domain}/robots.txt"))?;
  let response = reqwest::get(url).await?;
  println!("Response: {response:?}");

  // Try to look for an atom feed by parsing the HTML <head> links
  let url = Url::parse(&format!("https://{domain}"))?;
  let response = reqwest::get(url).await?;
  let body = response.text().await?;
  let html = Html::parse_document(&body);
  let tag = Selector::parse("link[type=\"application/atom+xml\"]").unwrap();
  for element in html.select(&tag) {
    let attrs = element.value().attrs();
    for attr in attrs {
      println!("Attrs: {attr:?}");
    }
  }

  // Go through each of the entries in the atom feed

  Ok(())
}
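
The crawler() function above is still a stub. Based on the module doc comment, the intended event loop selects over a Postgres notification and a timer; below is a minimal sketch of that shape, with a tokio channel standing in for the LISTEN/NOTIFY stream (no Postgres client is wired up in this commit, and JobNotification and run_loop are hypothetical names):

  use std::time::Duration;

  use anyhow::Result;
  use tokio::sync::mpsc;

  /// Hypothetical stand-in for a Postgres NOTIFY payload.
  struct JobNotification;

  /// Wait for either a notification or the next timer tick, then crawl.
  async fn run_loop(mut notifications: mpsc::Receiver<JobNotification>) -> Result<()> {
    loop {
      tokio::select! {
        note = notifications.recv() => {
          // Channel closed: the notification source went away, so stop.
          if note.is_none() {
            break;
          }
        }
        // Stand-in timer; the real loop would sleep until the next job is due.
        _ = tokio::time::sleep(Duration::from_secs(60)) => {}
      }

      // Here the real crawler would fetch the due job from the database,
      // crawl the site, and update the full-text index.
    }
    Ok(())
  }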

src/main.rs (Normal file, 47 additions)
@@ -0,0 +1,47 @@
#[macro_use]
extern crate clap;

mod crawler;

use anyhow::Result;
use clap::Parser;

use crate::crawler::{crawl_single_site, crawler, CrawlSingleSiteOpts};

#[derive(Parser, Debug)]
struct Opt {
  #[clap(subcommand)]
  command: Subcommand,
}

#[derive(Parser, Debug)]
enum Subcommand {
  #[clap(name = "crawler")]
  Crawler,

  #[clap(name = "web")]
  Web,

  /// Crawl just a single site
  #[clap(name = "crawl-single-site")]
  CrawlSingleSite(CrawlSingleSiteOpts),
}

#[tokio::main]
async fn main() -> Result<()> {
  let opt = Opt::parse();

  println!("Hello, world! {opt:?}");

  match opt.command {
    Subcommand::Web => {}
    Subcommand::Crawler => {
      crawler().await?;
    }
    Subcommand::CrawlSingleSite(opts) => {
      crawl_single_site(opts).await?;
    }
  }

  Ok(())
}
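
Given these subcommands, running the binary looks like the following (example.com is just a placeholder domain):

  cargo run -- crawler
  cargo run -- web
  cargo run -- crawl-single-site --domain example.com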