Michael Zhang 2023-01-26 09:36:05 -06:00
commit 71f42e1d88
14 changed files with 2142 additions and 0 deletions

.env (new file, 1 line)
@@ -0,0 +1 @@
DATABASE_URL=postgres://postgres:example@localhost:5433/searchinblog

.envrc (new file, 1 line)
@@ -0,0 +1 @@
use flake

.gitignore (new vendored file, 2 lines)
@@ -0,0 +1,2 @@
/target
.direnv

Cargo.lock (new generated file, 1832 lines)
File diff suppressed because it is too large.

Cargo.toml (new file, 14 lines)
@@ -0,0 +1,14 @@
[package]
name = "searchinblog"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
anyhow = "1.0.68"
axum = { version = "0.6.4", features = ["http2", "macros"] }
clap = { version = "4.1.4", features = ["derive", "cargo"] }
reqwest = { version = "0.11.14", default-features = false, features = ["rustls-tls-webpki-roots", "json", "gzip", "mime_guess"] }
scraper = "0.14.0"
tokio = { version = "1.24.2", features = ["full"] }

README.md (new file, 14 lines)
@@ -0,0 +1,14 @@
## Crawling requirements

- You must serve a robots.txt to describe the terms of crawling; otherwise,
  your site will be ignored by this bot (a rough check is sketched below).
- You must serve an Atom feed.

## RFC References

- **[RFC4287]**: The Atom Syndication Format
- **[RFC9309]**: Robots Exclusion Protocol

[rfc4287]: https://www.rfc-editor.org/rfc/rfc4287
[rfc9309]: https://www.rfc-editor.org/rfc/rfc9309
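
Below is a rough sketch (not part of this commit) of how these requirements could be checked before a site is crawled, using only crates already declared in Cargo.toml (anyhow, reqwest, scraper). The function name site_is_crawlable and the strict pass/fail policy are assumptions for illustration; the actual per-site checks this commit ships live in src/crawler.rs further down.

// Illustrative only: decide whether a domain meets the crawling requirements
// above. A missing robots.txt or a missing Atom feed <link> means "skip".
use anyhow::Result;
use reqwest::Url;
use scraper::{Html, Selector};

async fn site_is_crawlable(domain: &str) -> Result<bool> {
  // Requirement 1: the site must serve a robots.txt.
  let robots = Url::parse(&format!("https://{domain}/robots.txt"))?;
  if !reqwest::get(robots).await?.status().is_success() {
    return Ok(false);
  }

  // Requirement 2: the site must advertise an Atom feed via autodiscovery.
  let home = Url::parse(&format!("https://{domain}/"))?;
  let body = reqwest::get(home).await?.text().await?;
  let html = Html::parse_document(&body);
  let feed_link = Selector::parse(r#"link[type="application/atom+xml"]"#).unwrap();
  Ok(html.select(&feed_link).next().is_some())
}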

default.nix (new file, 12 lines)
@@ -0,0 +1,12 @@
{ toolchain, makeRustPlatform, pkg-config }:
let rustPlatform = makeRustPlatform { inherit (toolchain) cargo rustc; };
in rustPlatform.buildRustPackage {
  name = "searchinblog";
  src = ./.;
  cargoLock.lockFile = ./Cargo.lock;
  nativeBuildInputs = [ pkg-config ];
  buildInputs = [ ];
}

docker-compose.yml (new file, 8 lines)
@@ -0,0 +1,8 @@
version: "3"
services:
  db:
    image: postgres
    # Quoted so YAML keeps the port mapping as a string.
    ports: ["5433:5432"]
    environment:
      POSTGRES_PASSWORD: example

flake.lock (new file, 93 lines)
@@ -0,0 +1,93 @@
{
  "nodes": {
    "fenix": {
      "inputs": {
        "nixpkgs": "nixpkgs",
        "rust-analyzer-src": "rust-analyzer-src"
      },
      "locked": {
        "lastModified": 1674541370,
        "narHash": "sha256-L62dKDX6fIUQhlna9R8PKSAEGZ7ueU5gRGxZzZs/Zx8=",
        "owner": "nix-community",
        "repo": "fenix",
        "rev": "4435f8e9da13581e51ba1f92a25d7d54c776ad94",
        "type": "github"
      },
      "original": {
        "owner": "nix-community",
        "repo": "fenix",
        "type": "github"
      }
    },
    "flake-utils": {
      "locked": {
        "lastModified": 1667395993,
        "narHash": "sha256-nuEHfE/LcWyuSWnS8t12N1wc105Qtau+/OdUAjtQ0rA=",
        "owner": "numtide",
        "repo": "flake-utils",
        "rev": "5aed5285a952e0b949eb3ba02c12fa4fcfef535f",
        "type": "github"
      },
      "original": {
        "id": "flake-utils",
        "type": "indirect"
      }
    },
    "nixpkgs": {
      "locked": {
        "lastModified": 1674211260,
        "narHash": "sha256-xU6Rv9sgnwaWK7tgCPadV6HhI2Y/fl4lKxJoG2+m9qs=",
        "owner": "nixos",
        "repo": "nixpkgs",
        "rev": "5ed481943351e9fd354aeb557679624224de38d5",
        "type": "github"
      },
      "original": {
        "owner": "nixos",
        "ref": "nixos-unstable",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "nixpkgs_2": {
      "locked": {
        "lastModified": 1667629849,
        "narHash": "sha256-P+v+nDOFWicM4wziFK9S/ajF2lc0N2Rg9p6Y35uMoZI=",
        "owner": "nixos",
        "repo": "nixpkgs",
        "rev": "3bacde6273b09a21a8ccfba15586fb165078fb62",
        "type": "github"
      },
      "original": {
        "id": "nixpkgs",
        "type": "indirect"
      }
    },
    "root": {
      "inputs": {
        "fenix": "fenix",
        "flake-utils": "flake-utils",
        "nixpkgs": "nixpkgs_2"
      }
    },
    "rust-analyzer-src": {
      "flake": false,
      "locked": {
        "lastModified": 1674491573,
        "narHash": "sha256-1hMPOn2dlMfWRWvuaWcSxNquKpvjGVXq2rVw6UJy75Q=",
        "owner": "rust-lang",
        "repo": "rust-analyzer",
        "rev": "c552e5a55f13b2f08d506bb46fb74dbc11702d0d",
        "type": "github"
      },
      "original": {
        "owner": "rust-lang",
        "ref": "nightly",
        "repo": "rust-analyzer",
        "type": "github"
      }
    }
  },
  "root": "root",
  "version": 7
}

flake.nix (new file, 28 lines)
@@ -0,0 +1,28 @@
{
  inputs = { fenix.url = "github:nix-community/fenix"; };

  outputs = { self, nixpkgs, flake-utils, fenix }:
    flake-utils.lib.eachDefaultSystem (system:
      let
        pkgs = import nixpkgs {
          inherit system;
          overlays = [ fenix.overlays.default ];
        };
        toolchain = pkgs.fenix.stable;

        flakePkgs = {
          searchinblog = pkgs.callPackage ./. { inherit toolchain; };
        };
      in rec {
        packages = flake-utils.lib.flattenTree flakePkgs;
        defaultPackage = packages.searchinblog;

        devShell = pkgs.mkShell {
          inputsFrom = with packages; [ searchinblog ];
          packages = (with pkgs; [ cargo-watch cargo-deny cargo-edit sqlx-cli ])
            ++ (with toolchain; [ cargo rustc rustfmt ]);
          CARGO_UNSTABLE_SPARSE_REGISTRY = "true";
        };
      });
}

SQL migration (new file, 34 lines)
@@ -0,0 +1,34 @@
CREATE TABLE websites (
  id INTEGER PRIMARY KEY,
  domain TEXT UNIQUE,
  last_collected TIMESTAMP WITHOUT TIME ZONE
);

CREATE TABLE posts (
  id INTEGER PRIMARY KEY,
  url TEXT UNIQUE,
  last_updated_by_version TEXT,
  content TEXT,
  content_tsv TSVECTOR
);

-- Full-text queries match against content_tsv, e.g.
--   SELECT url FROM posts WHERE content_tsv @@ plainto_tsquery('english', 'some words');
CREATE INDEX "posts_fti_idx" ON posts USING GIN(content_tsv);

-- tsvector_update_trigger rewrites NEW, so it must run BEFORE INSERT OR UPDATE,
-- and its first argument is the tsvector column to maintain.
CREATE TRIGGER "posts_fti_trigger" BEFORE INSERT OR UPDATE
  ON posts FOR EACH ROW
  EXECUTE PROCEDURE tsvector_update_trigger(content_tsv, 'pg_catalog.english', content);

CREATE TABLE authors (
  id INTEGER PRIMARY KEY,
  url TEXT UNIQUE
);

CREATE TABLE posts_keywords (
  post_id INTEGER,
  keyword TEXT,
  PRIMARY KEY(post_id, keyword),
  CONSTRAINT fk_post FOREIGN KEY(post_id) REFERENCES posts(id)
  -- CONSTRAINT fk_keyword FOREIGN KEY(keyword) REFERENCES keywords(keyword)
);

CREATE INDEX "posts_keywords_idx" ON posts_keywords(keyword);

rustfmt.toml (new file, 2 lines)
@@ -0,0 +1,2 @@
max_width = 80
tab_spaces = 2

src/crawler.rs (new file, 54 lines)
@@ -0,0 +1,54 @@
//! Crawler
//! ===
//!
//! The crawler design is built on Postgres pub/sub (LISTEN/NOTIFY). It always
//! waits on two events:
//!
//! - one that listens for Postgres pubsub triggers, and
//! - one that waits on a timer set for the next scheduled job.
//!
//! Once either event fires, it retrieves the job from the database and
//! performs the crawl, fetching the pages, populating the database, and
//! filling the full-text index.

use anyhow::Result;
use clap::Parser;
use reqwest::Url;
use scraper::{Html, Selector};

/// Long-running crawl loop (not implemented yet).
pub async fn crawler() -> Result<()> {
  Ok(())
}

#[derive(Debug, Parser)]
pub struct CrawlSingleSiteOpts {
  /// The domain to crawl
  #[clap(name = "domain", long)]
  domain: String,
}

/// Crawl a single site.
pub async fn crawl_single_site(opts: CrawlSingleSiteOpts) -> Result<()> {
  let CrawlSingleSiteOpts { domain, .. } = opts;

  // Check the site's robots.txt to see whether and how we may crawl it.
  let url = Url::parse(&format!("https://{domain}/robots.txt"))?;
  let response = reqwest::get(url).await?;
  println!("Response: {response:?}");

  // Try to find an Atom feed by parsing the <link> tags in the HTML head.
  let url = Url::parse(&format!("https://{domain}"))?;
  let response = reqwest::get(url).await?;
  let body = response.text().await?;
  let html = Html::parse_document(&body);

  let tag = Selector::parse("link[type=\"application/atom+xml\"]").unwrap();
  for element in html.select(&tag) {
    let attrs = element.value().attrs();
    for attr in attrs {
      println!("Attrs: {attr:?}");
    }
  }

  // TODO: walk each entry in the Atom feed.

  Ok(())
}
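
The event loop described in the module comment above is still a stub (crawler() simply returns Ok). Below is a minimal sketch of the two-event wait it describes, with a tokio::sync::mpsc receiver standing in for the Postgres LISTEN/NOTIFY stream; the names crawl_loop, notifications, and next_job_at, plus the fixed fallback timer and the commented-out fetch step, are assumptions for illustration rather than part of this commit.

// Sketch of the crawler event loop: wake on either a pubsub notification or
// the timer for the next scheduled job, then process whatever is due.
use std::time::Duration;

use anyhow::Result;
use tokio::sync::mpsc::Receiver;
use tokio::time::{sleep_until, Instant};

async fn crawl_loop(mut notifications: Receiver<String>) -> Result<()> {
  loop {
    // In the real design this would come from the jobs table; here it is a
    // fixed one-minute fallback so the loop always has a timer to wait on.
    let next_job_at = Instant::now() + Duration::from_secs(60);

    tokio::select! {
      // A pubsub trigger fired: a job was inserted or updated.
      Some(_payload) = notifications.recv() => {}
      // The timer for the next known job elapsed.
      _ = sleep_until(next_job_at) => {}
    }

    // Either way, fetch the due job and crawl it, populating the posts table
    // and the full-text index.
    // run_due_jobs().await?;
  }
}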

src/main.rs (new file, 47 lines)
@@ -0,0 +1,47 @@
#[macro_use]
extern crate clap;

mod crawler;

use anyhow::Result;
use clap::Parser;

use crate::crawler::{crawl_single_site, crawler, CrawlSingleSiteOpts};

#[derive(Parser, Debug)]
struct Opt {
  #[clap(subcommand)]
  command: Subcommand,
}

#[derive(Parser, Debug)]
enum Subcommand {
  #[clap(name = "crawler")]
  Crawler,

  #[clap(name = "web")]
  Web,

  /// Crawl just a single site
  #[clap(name = "crawl-single-site")]
  CrawlSingleSite(CrawlSingleSiteOpts),
}

#[tokio::main]
async fn main() -> Result<()> {
  let opt = Opt::parse();
  println!("Hello, world! {opt:?}");

  match opt.command {
    Subcommand::Web => {}
    Subcommand::Crawler => {
      crawler().await?;
    }
    Subcommand::CrawlSingleSite(opts) => {
      crawl_single_site(opts).await?;
    }
  }

  Ok(())
}