Requête Postgres douloureusement lente utilisant WHERE sur de nombreuses lignes adjacentes

Étape 1 : utilisez une fonction de fenêtre pour obtenir les enregistrements adjacents, en évitant la douloureuse auto-jointure (12 tables, c'est très proche de la limite à partir de laquelle GEQO prend le relais) :

-- Export matching token sequences from orderedflatcorpus as tab-separated CSV.
-- Every row is paired with values taken from the 1..10 rows that follow it
-- in id order: alias cNw / cNl / cNp holds the word / lemma / pos of the row
-- (N - 1) positions ahead of the current one.
COPY (
    WITH shifted AS (
        SELECT
            tok.id,
            tok.source,
            tok.word,
            LEAD(tok.word,  1)  OVER by_id AS c2w,
            LEAD(tok.word,  2)  OVER by_id AS c3w,
            LEAD(tok.word,  3)  OVER by_id AS c4w,
            LEAD(tok.lemma, 3)  OVER by_id AS c4l,
            LEAD(tok.pos,   3)  OVER by_id AS c4p,
            LEAD(tok.pos,   4)  OVER by_id AS c5p,
            LEAD(tok.word,  4)  OVER by_id AS c5w,
            LEAD(tok.word,  5)  OVER by_id AS c6w,
            LEAD(tok.lemma, 5)  OVER by_id AS c6l,
            LEAD(tok.word,  6)  OVER by_id AS c7w,
            LEAD(tok.pos,   6)  OVER by_id AS c7p,
            LEAD(tok.word,  7)  OVER by_id AS c8w,
            LEAD(tok.word,  8)  OVER by_id AS c9w,
            LEAD(tok.lemma, 8)  OVER by_id AS c9l,
            LEAD(tok.pos,   8)  OVER by_id AS c9p,
            LEAD(tok.word,  9)  OVER by_id AS c10w,
            LEAD(tok.word,  10) OVER by_id AS c11w
        FROM orderedflatcorpus AS tok
        -- A single shared window: the whole table, ordered by id.
        WINDOW by_id AS (ORDER BY tok.id)
    )
    SELECT
        id,
        source,
        word,
        c2w,
        c3w,
        c4w,
        c4l,
        c4p,
        c5w,
        c6w,
        c7w,
        c8w,
        c9w,
        c9l,
        c9p,
        c10w,
        c11w
    FROM shifted
    -- c5p, c6l and c7p are only used for filtering; they are not exported.
    WHERE c4p LIKE 'v%'
      AND c5p = 'appge'
      AND c6l = 'way'
      AND c7p LIKE 'i%'
      AND c8w = 'the'
      AND c9p LIKE 'n%'
    ORDER BY id
)
-- Alternative destination, kept for reference:
-- TO '/home/postgres/Results/OUTPUT.csv' DELIMITER E'\t' csv header;
TO '/tmp/OUTPUT2.csv' DELIMITER E'\t' CSV HEADER;

Étape 2 : [modèle de données] Les colonnes {word, lemma, pos} semblent former un groupe à faible cardinalité ; vous pouvez les extraire dans une table word/lemma/pos séparée :

    -- Helper index that speeds up both the DISTINCT extraction and the
    -- final UPDATE below. It needs no explicit DROP: it goes away
    -- automatically when its columns are dropped at the end of the script.
    CREATE INDEX
        ON tmp.orderedflatcorpus (word, lemma, pos);

    ANALYZE tmp.orderedflatcorpus;

    -- Lookup table: each distinct (word, lemma, pos) combination, once.
    CREATE TABLE tmp.words AS
        SELECT DISTINCT
            word,
            lemma,
            pos
        FROM tmp.orderedflatcorpus;

    -- Surrogate key for the lookup table.
    ALTER TABLE tmp.words
        ADD COLUMN id SERIAL NOT NULL PRIMARY KEY;

    -- The natural key stays unique as well.
    ALTER TABLE tmp.words
        ADD UNIQUE (word, lemma, pos);

    -- Foreign-key column linking the corpus to the lookup table.
    -- It is left nullable here (NOT NULL is deliberately not applied yet)
    -- because it is only populated by the UPDATE further down.
    ALTER TABLE tmp.orderedflatcorpus
        ADD COLUMN words_id INTEGER -- NOT NULL
            REFERENCES tmp.words (id);

    -- A supporting index helps foreign-key checks and joins a lot.
    CREATE INDEX orderedflatcorpus_words_id_fk
        ON tmp.orderedflatcorpus (words_id);

    ANALYZE tmp.orderedflatcorpus;
    ANALYZE tmp.words;

    -- Populate the foreign-key column.
    -- IS NOT DISTINCT FROM is required instead of "=": the joined columns
    -- may contain NULLs, and those NULLs MUST compare equal to each other.
    UPDATE tmp.orderedflatcorpus AS corpus
    SET words_id = lookup.id
    FROM tmp.words AS lookup
    WHERE corpus.word  IS NOT DISTINCT FROM lookup.word
      AND corpus.lemma IS NOT DISTINCT FROM lookup.lemma
      AND corpus.pos   IS NOT DISTINCT FROM lookup.pos;

    -- The three text columns now live in tmp.words only.
    ALTER TABLE tmp.orderedflatcorpus
        DROP COLUMN word,
        DROP COLUMN lemma,
        DROP COLUMN pos;

Et la nouvelle requête, avec un JOIN sur la table words :

-- Export matching token sequences as tab-separated CSV, two-table variant:
-- word / lemma / pos are read from the words table, joined through
-- orderedflatcorpus.words_id. Alias cNw / cNl / cNp holds the word / lemma /
-- pos of the row (N - 1) positions ahead of the current one in corpus id order.
COPY (
    WITH shifted AS (
        SELECT
            tok.id,
            tok.source,
            vocab.word,
            LEAD(vocab.word,  1)  OVER by_id AS c2w,
            LEAD(vocab.word,  2)  OVER by_id AS c3w,
            LEAD(vocab.word,  3)  OVER by_id AS c4w,
            LEAD(vocab.lemma, 3)  OVER by_id AS c4l,
            LEAD(vocab.pos,   3)  OVER by_id AS c4p,
            LEAD(vocab.pos,   4)  OVER by_id AS c5p,
            LEAD(vocab.word,  4)  OVER by_id AS c5w,
            LEAD(vocab.word,  5)  OVER by_id AS c6w,
            LEAD(vocab.lemma, 5)  OVER by_id AS c6l,
            LEAD(vocab.word,  6)  OVER by_id AS c7w,
            LEAD(vocab.pos,   6)  OVER by_id AS c7p,
            LEAD(vocab.word,  7)  OVER by_id AS c8w,
            LEAD(vocab.word,  8)  OVER by_id AS c9w,
            LEAD(vocab.lemma, 8)  OVER by_id AS c9l,
            LEAD(vocab.pos,   8)  OVER by_id AS c9p,
            LEAD(vocab.word,  9)  OVER by_id AS c10w,
            LEAD(vocab.word,  10) OVER by_id AS c11w
        FROM orderedflatcorpus AS tok
        INNER JOIN words AS vocab
            ON vocab.id = tok.words_id
        -- Ordering uses the corpus id, not the id of the words table.
        WINDOW by_id AS (ORDER BY tok.id)
    )
    SELECT
        id,
        source,
        word,
        c2w,
        c3w,
        c4w,
        c4l,
        c4p,
        c5w,
        c6w,
        c7w,
        c8w,
        c9w,
        c9l,
        c9p,
        c10w,
        c11w
    FROM shifted
    -- c5p, c6l and c7p are only used for filtering; they are not exported.
    WHERE c4p LIKE 'v%'
      AND c5p = 'appge'
      AND c6l = 'way'
      AND c7p LIKE 'i%'
      AND c8w = 'the'
      AND c9p LIKE 'n%'
    ORDER BY id
)
-- Alternative destination, kept for reference:
-- TO '/home/postgres/Results/OUTPUT.csv' DELIMITER E'\t' csv header;
TO '/tmp/OUTPUT3.csv' DELIMITER E'\t' CSV HEADER;

Remarque : j'obtiens deux lignes en sortie, car j'ai un peu trop assoupli les conditions...

Mise à jour : la première requête, en évitant la CTE

-- Same export as the single-table query, written with a derived table
-- (subquery in FROM) instead of a CTE. Alias cNw / cNl / cNp holds the
-- word / lemma / pos of the row (N - 1) positions ahead in id order.
COPY (
    SELECT
        id,
        source,
        word,
        c2w,
        c3w,
        c4w,
        c4l,
        c4p,
        c5w,
        c6w,
        c7w,
        c8w,
        c9w,
        c9l,
        c9p,
        c10w,
        c11w
    FROM (
        SELECT
            tok.id,
            tok.source,
            tok.word,
            LEAD(tok.word,  1)  OVER by_id AS c2w,
            LEAD(tok.word,  2)  OVER by_id AS c3w,
            LEAD(tok.word,  3)  OVER by_id AS c4w,
            LEAD(tok.lemma, 3)  OVER by_id AS c4l,
            LEAD(tok.pos,   3)  OVER by_id AS c4p,
            LEAD(tok.pos,   4)  OVER by_id AS c5p,
            LEAD(tok.word,  4)  OVER by_id AS c5w,
            LEAD(tok.word,  5)  OVER by_id AS c6w,
            LEAD(tok.lemma, 5)  OVER by_id AS c6l,
            LEAD(tok.word,  6)  OVER by_id AS c7w,
            LEAD(tok.pos,   6)  OVER by_id AS c7p,
            LEAD(tok.word,  7)  OVER by_id AS c8w,
            LEAD(tok.word,  8)  OVER by_id AS c9w,
            LEAD(tok.lemma, 8)  OVER by_id AS c9l,
            LEAD(tok.pos,   8)  OVER by_id AS c9p,
            LEAD(tok.word,  9)  OVER by_id AS c10w,
            LEAD(tok.word,  10) OVER by_id AS c11w
        FROM orderedflatcorpus AS tok
        -- A single shared window: the whole table, ordered by id.
        WINDOW by_id AS (ORDER BY tok.id)
    ) AS shifted
    -- c5p, c6l and c7p are only used for filtering; they are not exported.
    WHERE c4p LIKE 'v%'
      AND c5p = 'appge'
      AND c6l = 'way'
      AND c7p LIKE 'i%'
      AND c8w = 'the'
      AND c9p LIKE 'n%'
    ORDER BY id
)
-- Alternative destination, kept for reference:
-- TO '/home/postgres/Results/OUTPUT.csv' DELIMITER E'\t' csv header;
TO '/tmp/OUTPUT2a.csv' DELIMITER E'\t' CSV HEADER;

[une transformation similaire pourrait être effectuée sur la deuxième requête]

MISE À JOUR 2 : la version avec sous-requête pour la variante à deux tables.

-- Derived-table (subquery) form of the two-table query: word / lemma / pos
-- come from the words table, joined through orderedflatcorpus.words_id.
-- Alias cNw / cNl / cNp holds the word / lemma / pos of the row (N - 1)
-- positions ahead of the current one in corpus id order.
-- The COPY wrapper and EXPLAIN ANALYZE are disabled; un-comment to use them.
-- copy(
-- EXPLAIN ANALYZE
SELECT
    c1i,
    c1s,
    c1w,
    c2w,
    c3w,
    c4w,
    c4l,
    c4p,
    c5w,
    c6w,
    c7w,
    c8w,
    c9w,
    c9l,
    c9p,
    c10w,
    c11w
FROM (
    SELECT
        tok.id       AS c1i,
        tok.source   AS c1s,
        vocab.word   AS c1w,
        LEAD(vocab.word,  1)  OVER next_ten AS c2w,
        LEAD(vocab.word,  2)  OVER next_ten AS c3w,
        LEAD(vocab.word,  3)  OVER next_ten AS c4w,
        LEAD(vocab.lemma, 3)  OVER next_ten AS c4l,
        LEAD(vocab.pos,   3)  OVER next_ten AS c4p,
        LEAD(vocab.pos,   4)  OVER next_ten AS c5p,
        LEAD(vocab.word,  4)  OVER next_ten AS c5w,
        LEAD(vocab.word,  5)  OVER next_ten AS c6w,
        LEAD(vocab.lemma, 5)  OVER next_ten AS c6l,
        LEAD(vocab.word,  6)  OVER next_ten AS c7w,
        LEAD(vocab.pos,   6)  OVER next_ten AS c7p,
        LEAD(vocab.word,  7)  OVER next_ten AS c8w,
        LEAD(vocab.word,  8)  OVER next_ten AS c9w,
        LEAD(vocab.lemma, 8)  OVER next_ten AS c9l,
        LEAD(vocab.pos,   8)  OVER next_ten AS c9p,
        LEAD(vocab.word,  9)  OVER next_ten AS c10w,
        LEAD(vocab.word,  10) OVER next_ten AS c11w
    FROM orderedflatcorpus AS tok
    INNER JOIN words AS vocab
        ON vocab.id = tok.words_id
    -- "1=1" is kept as an anchor so the disabled AND-predicates below can be
    -- re-enabled without further edits.
    WHERE 1=1
    /*
    These predicates *could* be used to prune out unmatched items, but the
    original author could not get that to work ...
    AND EXISTS (SELECT * FROM orderedflatcorpus c4 JOIN words w4 ON w4.id=c4.words_id
            WHERE c4.id = 3+tok.id -- AND w4.pos LIKE 'v%'
            )  -- OMG
    AND EXISTS (SELECT * FROM orderedflatcorpus c5 JOIN words w5 ON w5.id=c5.words_id
            WHERE c5.id = 4+tok.id -- AND w5.pos = 'appge'
            ) -- OMG
    AND EXISTS (SELECT * FROM orderedflatcorpus c7 JOIN words w7 ON w7.id=c7.words_id
            WHERE c7.id = 6+tok.id -- AND w7.pos LIKE 'i%'
            ) -- OMG
    AND EXISTS (SELECT * FROM orderedflatcorpus c9 JOIN words w9 ON w9.id=c9.words_id
            WHERE c9.id = 8+tok.id -- AND w9.pos LIKE 'n%'
            ) -- OMG
    AND EXISTS (SELECT * FROM orderedflatcorpus c8 JOIN words w8 ON w8.id=c8.words_id
            WHERE c8.id = 7+tok.id -- AND w8.word = 'the'
            )  -- OMG
    */
    -- This window carries a frame clause, so it must be referenced as
    -- "OVER next_ten" (the parenthesised "OVER (next_ten)" form is rejected
    -- for windows that specify a frame).
    WINDOW next_ten AS (
        ORDER BY tok.id
        ROWS BETWEEN CURRENT ROW AND 10 FOLLOWING
    )
) AS shifted
-- c5p, c6l and c7p are only used for filtering; they are not returned.
WHERE c4p LIKE 'v%'
  AND c5p = 'appge'
  AND c6l = 'way'
  AND c7p LIKE 'i%'
  AND c8w = 'the'
  AND c9p LIKE 'n%'
ORDER BY c1i
;
-- )
-- TO '/home/postgres/Results/OUTPUT.csv' DELIMITER E'\t' csv header;
-- TO '/tmp/OUTPUT3b.csv' DELIMITER E'\t' csv header;