Initial
This commit is contained in:
commit
71f42e1d88
14 changed files with 2142 additions and 0 deletions
1
.env
Normal file
1
.env
Normal file
|
@ -0,0 +1 @@
|
||||||
|
DATABASE_URL=postgres://postgres:example@localhost:5433/searchinblog
|
1
.envrc
Normal file
1
.envrc
Normal file
|
@ -0,0 +1 @@
|
||||||
|
use flake
|
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
/target
|
||||||
|
.direnv
|
1832
Cargo.lock
generated
Normal file
1832
Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load diff
14
Cargo.toml
Normal file
14
Cargo.toml
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
[package]
|
||||||
|
name = "searchinblog"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2021"
|
||||||
|
|
||||||
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
anyhow = "1.0.68"
|
||||||
|
axum = { version = "0.6.4", features = ["http2", "macros"] }
|
||||||
|
clap = { version = "4.1.4", features = ["derive", "cargo"] }
|
||||||
|
reqwest = { version = "0.11.14", default-features = false, features = ["rustls-tls-webpki-roots", "json", "gzip", "mime_guess"] }
|
||||||
|
scraper = "0.14.0"
|
||||||
|
tokio = { version = "1.24.2", features = ["full"] }
|
14
README.md
Normal file
14
README.md
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
## Crawling requirements
|
||||||
|
|
||||||
|
- You must serve a robots.txt to describe the terms of crawling. Otherwise,
  the site will be ignored by this bot.
|
||||||
|
|
||||||
|
- You must serve an atom feed.
|
||||||
|
|
||||||
|
## RFC References
|
||||||
|
|
||||||
|
- **[RFC4287]**: The Atom Syndication Format
|
||||||
|
- **[RFC9309]**: Robots Exclusion Protocol
|
||||||
|
|
||||||
|
[rfc4287]: https://www.rfc-editor.org/rfc/rfc4287
|
||||||
|
[rfc9309]: https://www.rfc-editor.org/rfc/rfc9309
|
12
default.nix
Normal file
12
default.nix
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
# Build the searchinblog crate. The Rust toolchain is injected by the caller
# (see flake.nix), so this derivation is toolchain-agnostic.
{ toolchain, makeRustPlatform, pkg-config }:

let
  # Use cargo/rustc from the supplied toolchain rather than nixpkgs' default.
  rustPlatform = makeRustPlatform { inherit (toolchain) cargo rustc; };
in rustPlatform.buildRustPackage {
  name = "searchinblog";
  src = ./.;
  # Vendor dependencies from the checked-in lockfile.
  cargoLock.lockFile = ./Cargo.lock;

  nativeBuildInputs = [ pkg-config ];
  buildInputs = [ ];
}
|
8
docker-compose.yml
Normal file
8
docker-compose.yml
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
version: "3"

services:
  # Local Postgres for development; matches DATABASE_URL in .env
  # (host port 5433 -> container port 5432).
  db:
    image: postgres
    # Quoted: an unquoted host:container pair inside a flow sequence can be
    # parsed by YAML as a nested mapping instead of the string "5433:5432".
    ports: ["5433:5432"]
    environment:
      POSTGRES_PASSWORD: example
|
93
flake.lock
Normal file
93
flake.lock
Normal file
|
@ -0,0 +1,93 @@
|
||||||
|
{
|
||||||
|
"nodes": {
|
||||||
|
"fenix": {
|
||||||
|
"inputs": {
|
||||||
|
"nixpkgs": "nixpkgs",
|
||||||
|
"rust-analyzer-src": "rust-analyzer-src"
|
||||||
|
},
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1674541370,
|
||||||
|
"narHash": "sha256-L62dKDX6fIUQhlna9R8PKSAEGZ7ueU5gRGxZzZs/Zx8=",
|
||||||
|
"owner": "nix-community",
|
||||||
|
"repo": "fenix",
|
||||||
|
"rev": "4435f8e9da13581e51ba1f92a25d7d54c776ad94",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "nix-community",
|
||||||
|
"repo": "fenix",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"flake-utils": {
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1667395993,
|
||||||
|
"narHash": "sha256-nuEHfE/LcWyuSWnS8t12N1wc105Qtau+/OdUAjtQ0rA=",
|
||||||
|
"owner": "numtide",
|
||||||
|
"repo": "flake-utils",
|
||||||
|
"rev": "5aed5285a952e0b949eb3ba02c12fa4fcfef535f",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"id": "flake-utils",
|
||||||
|
"type": "indirect"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nixpkgs": {
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1674211260,
|
||||||
|
"narHash": "sha256-xU6Rv9sgnwaWK7tgCPadV6HhI2Y/fl4lKxJoG2+m9qs=",
|
||||||
|
"owner": "nixos",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"rev": "5ed481943351e9fd354aeb557679624224de38d5",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "nixos",
|
||||||
|
"ref": "nixos-unstable",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nixpkgs_2": {
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1667629849,
|
||||||
|
"narHash": "sha256-P+v+nDOFWicM4wziFK9S/ajF2lc0N2Rg9p6Y35uMoZI=",
|
||||||
|
"owner": "nixos",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"rev": "3bacde6273b09a21a8ccfba15586fb165078fb62",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"id": "nixpkgs",
|
||||||
|
"type": "indirect"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"root": {
|
||||||
|
"inputs": {
|
||||||
|
"fenix": "fenix",
|
||||||
|
"flake-utils": "flake-utils",
|
||||||
|
"nixpkgs": "nixpkgs_2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"rust-analyzer-src": {
|
||||||
|
"flake": false,
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1674491573,
|
||||||
|
"narHash": "sha256-1hMPOn2dlMfWRWvuaWcSxNquKpvjGVXq2rVw6UJy75Q=",
|
||||||
|
"owner": "rust-lang",
|
||||||
|
"repo": "rust-analyzer",
|
||||||
|
"rev": "c552e5a55f13b2f08d506bb46fb74dbc11702d0d",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "rust-lang",
|
||||||
|
"ref": "nightly",
|
||||||
|
"repo": "rust-analyzer",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"root": "root",
|
||||||
|
"version": 7
|
||||||
|
}
|
28
flake.nix
Normal file
28
flake.nix
Normal file
|
@ -0,0 +1,28 @@
|
||||||
|
{
  # Only fenix is declared explicitly; nixpkgs and flake-utils are resolved
  # implicitly via the flake registry (they appear as "indirect" originals in
  # flake.lock).
  inputs = { fenix.url = "github:nix-community/fenix"; };

  outputs = { self, nixpkgs, flake-utils, fenix }:
    flake-utils.lib.eachDefaultSystem (system:
      let
        # nixpkgs instantiated with the fenix overlay so pkgs.fenix exists.
        pkgs = import nixpkgs {
          inherit system;
          overlays = [ fenix.overlays.default ];
        };

        # Stable Rust toolchain provided by fenix.
        toolchain = pkgs.fenix.stable;

        flakePkgs = {
          searchinblog = pkgs.callPackage ./. { inherit toolchain; };
        };
      in rec {
        packages = flake-utils.lib.flattenTree flakePkgs;
        defaultPackage = packages.searchinblog;

        # Dev shell: build inputs of the package plus cargo tooling and the
        # toolchain binaries themselves.
        devShell = pkgs.mkShell {
          inputsFrom = with packages; [ searchinblog ];
          packages = (with pkgs; [ cargo-watch cargo-deny cargo-edit sqlx-cli ])
            ++ (with toolchain; [ cargo rustc rustfmt ]);
          CARGO_UNSTABLE_SPARSE_REGISTRY = "true";
        };
      });
}
|
34
migrations/20230125084829_initial.sql
Normal file
34
migrations/20230125084829_initial.sql
Normal file
|
@ -0,0 +1,34 @@
|
||||||
|
-- Initial schema for the searchinblog crawler / full-text search index.

CREATE TABLE websites (
  -- GENERATED BY DEFAULT AS IDENTITY: a bare INTEGER PRIMARY KEY does not
  -- auto-increment in Postgres, so inserts would have had to supply ids by
  -- hand. BY DEFAULT (not ALWAYS) keeps explicit-id inserts working.
  id INTEGER PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY,
  domain TEXT UNIQUE,
  -- When this site was last crawled; NULL means never.
  last_collected TIMESTAMP WITHOUT TIME ZONE
);

CREATE TABLE posts (
  id INTEGER PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY,
  url TEXT UNIQUE,
  -- Crawler version that last wrote this row.
  last_updated_by_version TEXT,
  content TEXT,
  -- Full-text search vector, maintained by posts_fti_trigger below.
  content_tsv TSVECTOR
);

CREATE INDEX "posts_fti_idx" ON posts USING GIN(content_tsv);

-- tsvector_update_trigger must be fired BEFORE INSERT OR UPDATE: it works by
-- rewriting the NEW row, so AFTER is an error, and DELETE has no NEW row.
-- Its arguments are: the tsvector column to fill, the text-search
-- configuration, then the source text column(s). The previous form passed a
-- non-existent column (posts_fti) as the first argument.
CREATE TRIGGER "posts_fti_trigger" BEFORE INSERT OR UPDATE
ON posts FOR EACH ROW
EXECUTE PROCEDURE tsvector_update_trigger(content_tsv, 'pg_catalog.english', content);

CREATE TABLE authors (
  id INTEGER PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY,
  url TEXT UNIQUE
);

-- Many-to-many: keywords attached to a post.
CREATE TABLE posts_keywords (
  post_id INTEGER,
  keyword TEXT,

  PRIMARY KEY(post_id, keyword),
  CONSTRAINT fk_post FOREIGN KEY(post_id) REFERENCES posts(id)
  -- CONSTRAINT fk_keyword FOREIGN KEY(keyword) REFERENCES keywords(keyword)
);

-- The primary key already serves (post_id, keyword) lookups; this index
-- covers lookups by keyword alone.
CREATE INDEX "posts_keywords_idx" ON posts_keywords(keyword);
|
2
rustfmt.toml
Normal file
2
rustfmt.toml
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
max_width = 80
|
||||||
|
tab_spaces = 2
|
54
src/crawler.rs
Normal file
54
src/crawler.rs
Normal file
|
@ -0,0 +1,54 @@
|
||||||
|
//! Crawler
|
||||||
|
//! ===
|
||||||
|
//!
|
||||||
|
//! The crawler design is built upon an important Postgres feature: Pubsub. It always waits on 2
|
||||||
|
//! events:
|
||||||
|
//!
|
||||||
|
//! - One that listens to Postgres pubsub triggers
|
||||||
|
//! - One that waits on a timer based on the next available job
|
||||||
|
//!
|
||||||
|
//! Once one of these events fires, it retrieves the job from the database and performs the crawl.
|
||||||
|
//! The crawl will try to perform a retrieval and populate the database and fill the full text
|
||||||
|
//! index.
|
||||||
|
|
||||||
|
use anyhow::Result;
|
||||||
|
use reqwest::Url;
|
||||||
|
use scraper::{Html, Selector};
|
||||||
|
|
||||||
|
pub async fn crawler() -> Result<()> {
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Parser)]
|
||||||
|
pub struct CrawlSingleSiteOpts {
|
||||||
|
/// The domain to crawl
|
||||||
|
#[clap(name = "domain", long)]
|
||||||
|
domain: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Crawl a single site.
|
||||||
|
pub async fn crawl_single_site(opts: CrawlSingleSiteOpts) -> Result<()> {
|
||||||
|
let CrawlSingleSiteOpts { domain, .. } = opts;
|
||||||
|
|
||||||
|
// Check the robots.txt of the site to see if or how we can crawl
|
||||||
|
let url = Url::parse(&format!("https://{domain}/robots.txt"))?;
|
||||||
|
let response = reqwest::get(url).await?;
|
||||||
|
println!("Response: {response:?}");
|
||||||
|
|
||||||
|
// Try to look for an atom feed by parsing HTML headers
|
||||||
|
let url = Url::parse(&format!("https://{domain}"))?;
|
||||||
|
let response = reqwest::get(url).await?;
|
||||||
|
let body = response.text().await?;
|
||||||
|
let html = Html::parse_document(&body);
|
||||||
|
let tag = Selector::parse("link[type=\"application/atom+xml\"]").unwrap();
|
||||||
|
for element in html.select(&tag) {
|
||||||
|
let attrs = element.value().attrs();
|
||||||
|
for attr in attrs {
|
||||||
|
println!("Attrs: {attr:?}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Go through each of the files in the atom feed
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
47
src/main.rs
Normal file
47
src/main.rs
Normal file
|
@ -0,0 +1,47 @@
|
||||||
|
#[macro_use]
|
||||||
|
extern crate clap;
|
||||||
|
|
||||||
|
mod crawler;
|
||||||
|
|
||||||
|
use anyhow::Result;
|
||||||
|
use clap::Parser;
|
||||||
|
|
||||||
|
use crate::crawler::{crawl_single_site, crawler, CrawlSingleSiteOpts};
|
||||||
|
|
||||||
|
#[derive(Parser, Debug)]
|
||||||
|
struct Opt {
|
||||||
|
#[clap(subcommand)]
|
||||||
|
command: Subcommand,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Parser, Debug)]
|
||||||
|
enum Subcommand {
|
||||||
|
#[clap(name = "crawler")]
|
||||||
|
Crawler,
|
||||||
|
|
||||||
|
#[clap(name = "web")]
|
||||||
|
Web,
|
||||||
|
|
||||||
|
/// Crawl just a single site
|
||||||
|
#[clap(name = "crawl-single-site")]
|
||||||
|
CrawlSingleSite(CrawlSingleSiteOpts),
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() -> Result<()> {
|
||||||
|
let opt = Opt::parse();
|
||||||
|
|
||||||
|
println!("Hello, world! {opt:?}");
|
||||||
|
|
||||||
|
match opt.command {
|
||||||
|
Subcommand::Web => {}
|
||||||
|
Subcommand::Crawler => {
|
||||||
|
crawler().await?;
|
||||||
|
}
|
||||||
|
Subcommand::CrawlSingleSite(opts) => {
|
||||||
|
crawl_single_site(opts).await?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
Loading…
Reference in a new issue