<mxfile host="app.diagrams.net" modified="2026-06-10T00:00:00.000Z" agent="pi" version="24.7.17" type="device">
  <diagram id="ml-feature-pipeline" name="ML Feature Pipeline">
    <mxGraphModel dx="1700" dy="1680" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="1700" pageHeight="1680" math="0" shadow="0">
      <root>
        <mxCell id="0" />
        <mxCell id="1" parent="0" />
        <!-- Vertex cells. Document order is kept as-is because it determines draw order. -->
        <!-- Each label is HTML-escaped inside the value attribute (style has html=1); keep every value on a single physical line, since XML parsers turn a line break inside an attribute into a space. -->

        <!-- Banner across the top of the page. -->
        <mxCell id="title" value="&lt;b&gt;World Cup 2026 ML + Oracle Hybrid Retrieval Pipeline&lt;/b&gt;&lt;br/&gt;92-feature XGBoost built from chronological football trackers" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#e8f0ff;strokeColor=#4b6cb7;fontColor=#102033;fontFamily=Inter;fontSize=21;spacing=12;arcSize=12;" vertex="1" parent="1">
          <mxGeometry x="160" y="20" width="1380" height="76" as="geometry" />
        </mxCell>

        <!-- Top row (y=135), left to right: data, oracle, replay, factory. -->
        <mxCell id="data" value="&lt;b&gt;Canonical Kaggle CSVs&lt;/b&gt;&lt;br/&gt;results.csv · goalscorers.csv · shootouts.csv&lt;br/&gt;49k+ matches, 47k+ goals, 675 shootouts" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#e7f7ed;strokeColor=#2f855a;fontColor=#102033;fontFamily=Inter;fontSize=18;spacing=12;arcSize=12;" vertex="1" parent="1">
          <mxGeometry x="45" y="135" width="310" height="116" as="geometry" />
        </mxCell>
        <mxCell id="oracle" value="&lt;b&gt;Oracle AI Database&lt;/b&gt;&lt;br/&gt;MATCH_RESULTS · GOALSCORERS · SHOOTOUTS&lt;br/&gt;team statistics + competitive views" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#fdecec;strokeColor=#c53030;fontColor=#102033;fontFamily=Inter;fontSize=18;spacing=12;arcSize=12;" vertex="1" parent="1">
          <mxGeometry x="410" y="135" width="330" height="116" as="geometry" />
        </mxCell>
        <mxCell id="replay" value="&lt;b&gt;Chronological tracker replay&lt;/b&gt;&lt;br/&gt;Extract pre-match state → emit row → update trackers&lt;br/&gt;&lt;i&gt;No future leakage&lt;/i&gt;" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#fff4e5;strokeColor=#dd6b20;fontColor=#102033;fontFamily=Inter;fontSize=18;spacing=12;arcSize=12;" vertex="1" parent="1">
          <mxGeometry x="795" y="135" width="370" height="116" as="geometry" />
        </mxCell>
        <mxCell id="factory" value="&lt;b&gt;Feature factory&lt;/b&gt;&lt;br/&gt;Same tracker classes power training, cached predictions, and live &lt;code&gt;predict_match&lt;/code&gt; inference." style="rounded=1;whiteSpace=wrap;html=1;fillColor=#eef2ff;strokeColor=#5a67d8;fontColor=#102033;fontFamily=Inter;fontSize=18;spacing=12;arcSize=12;" vertex="1" parent="1">
          <mxGeometry x="1220" y="135" width="390" height="116" as="geometry" />
        </mxCell>

        <!-- Section header sitting above the three columns (bold via fontStyle=1). -->
        <mxCell id="feature_summary" value="Feature family extractors — 92 predictors assembled before each match update" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#f8fafc;strokeColor=#cbd5e1;fontColor=#102033;fontFamily=Inter;fontSize=20;spacing=12;arcSize=12;fontStyle=1;" vertex="1" parent="1">
          <mxGeometry x="515" y="305" width="670" height="64" as="geometry" />
        </mxCell>

        <!-- Centre column (x=630, width=440), top to bottom: row, split, models, artifact, outputs, lc, retrieval. -->
        <mxCell id="row" value="&lt;b&gt;Final model row&lt;/b&gt;&lt;br/&gt;92 numeric predictors&lt;br/&gt;same names in &lt;code&gt;enhanced_features.ALL_FEATURES&lt;/code&gt; and &lt;code&gt;best_model.pkl&lt;/code&gt;" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#eef2ff;strokeColor=#5a67d8;fontColor=#102033;fontFamily=Inter;fontSize=17;spacing=12;arcSize=12;" vertex="1" parent="1">
          <mxGeometry x="630" y="410" width="440" height="112" as="geometry" />
        </mxCell>
        <!-- "&amp;lt;" is intentional: it decodes to "&lt;" in the attribute, which the HTML label renders as a less-than sign. -->
        <mxCell id="split" value="&lt;b&gt;Training protocol&lt;/b&gt;&lt;br/&gt;Use matches from 1990+&lt;br/&gt;time split: train &amp;lt; 2020, test ≥ 2020" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#f7fafc;strokeColor=#4a5568;fontColor=#102033;fontFamily=Inter;fontSize=18;spacing=12;arcSize=12;" vertex="1" parent="1">
          <mxGeometry x="630" y="565" width="440" height="112" as="geometry" />
        </mxCell>
        <mxCell id="models" value="&lt;b&gt;Model progression&lt;/b&gt;&lt;br/&gt;Decision Tree → Random Forest → XGBoost / LightGBM&lt;br/&gt;Optuna + interactions + ensemble experiments" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#f7fafc;strokeColor=#4a5568;fontColor=#102033;fontFamily=Inter;fontSize=18;spacing=12;arcSize=12;" vertex="1" parent="1">
          <mxGeometry x="630" y="720" width="440" height="124" as="geometry" />
        </mxCell>
        <mxCell id="artifact" value="&lt;b&gt;Production artifact&lt;/b&gt;&lt;br/&gt;models/best_model.pkl&lt;br/&gt;classes: Win · Draw · Loss" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#e6fffa;strokeColor=#319795;fontColor=#102033;fontFamily=Inter;fontSize=18;spacing=12;arcSize=12;" vertex="1" parent="1">
          <mxGeometry x="630" y="890" width="440" height="112" as="geometry" />
        </mxCell>
        <mxCell id="outputs" value="&lt;b&gt;Inference outputs&lt;/b&gt;&lt;br/&gt;&lt;code&gt;predict_match&lt;/code&gt; live 92-feature row&lt;br/&gt;PREDICCIONES_FINAL cached matchups" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#fdecec;strokeColor=#c53030;fontColor=#102033;fontFamily=Inter;fontSize=17;spacing=12;arcSize=12;" vertex="1" parent="1">
          <mxGeometry x="630" y="1045" width="440" height="124" as="geometry" />
        </mxCell>
        <mxCell id="lc" value="&lt;b&gt;LangChain OracleVS hybrid store&lt;/b&gt;&lt;br/&gt;SOCCER_LANGCHAIN_DOCS via langchain-oracledb&lt;br/&gt;prediction docs + team facts + embeddings" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#fdecec;strokeColor=#c53030;fontColor=#102033;fontFamily=Inter;fontSize=17;spacing=12;arcSize=12;" vertex="1" parent="1">
          <mxGeometry x="630" y="1215" width="440" height="132" as="geometry" />
        </mxCell>
        <mxCell id="retrieval" value="&lt;b&gt;Final agent evidence path&lt;/b&gt;&lt;br/&gt;hybrid_retrieve = OracleHybridSearchRetriever when available&lt;br/&gt;fallback = Oracle Text + vector RRF" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#e8f0ff;strokeColor=#4b6cb7;fontColor=#102033;fontFamily=Inter;fontSize=17;spacing=12;arcSize=12;" vertex="1" parent="1">
          <mxGeometry x="630" y="1395" width="440" height="124" as="geometry" />
        </mxCell>

        <!-- Left column (x=55, width=480): feature families f0, f1, f4, then the leakage note. -->
        <!-- The six family counts in the labels (40 + 12 + 5 + 16 + 8 + 11) add up to the 92 quoted elsewhere; update them together. -->
        <mxCell id="f0" value="&lt;b&gt;Original baseline — 40&lt;/b&gt;&lt;br/&gt;Elo 8 · form/goals 19 · H2H 3 · context 10" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#eefbf3;strokeColor=#2f855a;fontColor=#102033;fontFamily=Inter;fontSize=18;spacing=12;arcSize=12;" vertex="1" parent="1">
          <mxGeometry x="55" y="410" width="480" height="112" as="geometry" />
        </mxCell>
        <mxCell id="f1" value="&lt;b&gt;Goalscorer intelligence — 12&lt;/b&gt;&lt;br/&gt;scoring depth · star dependency · penalties · late goals · first-half share" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#f0fff4;strokeColor=#38a169;fontColor=#102033;fontFamily=Inter;fontSize=18;spacing=12;arcSize=12;" vertex="1" parent="1">
          <mxGeometry x="55" y="565" width="480" height="112" as="geometry" />
        </mxCell>
        <mxCell id="f4" value="&lt;b&gt;Venue / geography — 5&lt;/b&gt;&lt;br/&gt;altitude · high-altitude flag · confederation strength · intercontinental" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#faf5ff;strokeColor=#805ad5;fontColor=#102033;fontFamily=Inter;fontSize=18;spacing=12;arcSize=12;" vertex="1" parent="1">
          <mxGeometry x="55" y="720" width="480" height="112" as="geometry" />
        </mxCell>
        <mxCell id="proof" value="&lt;b&gt;No leakage rule&lt;/b&gt;&lt;br/&gt;For every historical match: read tracker state first, emit one feature row, then update trackers with the result." style="rounded=1;whiteSpace=wrap;html=1;fillColor=#fff7d6;strokeColor=#d6a800;fontColor=#102033;fontFamily=Inter;fontSize=16;spacing=12;arcSize=12;" vertex="1" parent="1">
          <mxGeometry x="55" y="890" width="480" height="112" as="geometry" />
        </mxCell>

        <!-- Right column (x=1165, width=480): feature families f2, f3, f5, then the Elo note. -->
        <mxCell id="f2" value="&lt;b&gt;Momentum / psychology — 16&lt;/b&gt;&lt;br/&gt;streaks · unbeaten · clean sheets · comebacks · draw tendency · blowouts" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#fffaf0;strokeColor=#dd6b20;fontColor=#102033;fontFamily=Inter;fontSize=18;spacing=12;arcSize=12;" vertex="1" parent="1">
          <mxGeometry x="1165" y="410" width="480" height="112" as="geometry" />
        </mxCell>
        <mxCell id="f3" value="&lt;b&gt;Poisson xG — 8&lt;/b&gt;&lt;br/&gt;home/away λ · win/draw probabilities · variance · overperformance" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#ebf8ff;strokeColor=#3182ce;fontColor=#102033;fontFamily=Inter;fontSize=18;spacing=12;arcSize=12;" vertex="1" parent="1">
          <mxGeometry x="1165" y="565" width="480" height="112" as="geometry" />
        </mxCell>
        <mxCell id="f5" value="&lt;b&gt;Tournament context — 11&lt;/b&gt;&lt;br/&gt;World Cup form · competitive form · big-game factor · WC experience" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#fef5e7;strokeColor=#d69e2e;fontColor=#102033;fontFamily=Inter;fontSize=18;spacing=12;arcSize=12;" vertex="1" parent="1">
          <mxGeometry x="1165" y="720" width="480" height="112" as="geometry" />
        </mxCell>
        <mxCell id="elo" value="&lt;b&gt;Elo scoring note&lt;/b&gt;&lt;br/&gt;Start 1500 · home +100 · K: WC 60 / continental 50 / qualifier 40 / friendly 20&lt;br/&gt;R_new = R_old + K×G×(actual−expected)" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#fff7d6;strokeColor=#d6a800;fontColor=#102033;fontFamily=Inter;fontSize=16;spacing=12;arcSize=12;" vertex="1" parent="1">
          <mxGeometry x="1165" y="890" width="480" height="132" as="geometry" />
        </mxCell>

        <!-- Footer callout at the bottom of the page. -->
        <mxCell id="callout" value="&lt;b&gt;Workshop proof point&lt;/b&gt;&lt;br/&gt;Spain vs Brazil uses live 92-feature inference and can retrieve the matching prediction document from OracleVS hybrid search." style="rounded=1;whiteSpace=wrap;html=1;fillColor=#fff7d6;strokeColor=#d6a800;fontColor=#102033;fontFamily=Inter;fontSize=16;spacing=12;arcSize=12;" vertex="1" parent="1">
          <mxGeometry x="410" y="1565" width="880" height="82" as="geometry" />
        </mxCell>
        <!-- Connector cells. All twelve share one style string: orthogonal routing, rounded corners, 3px #334155 stroke, filled block arrowhead. -->
        <!-- Geometry is relative with no waypoints, so routing is derived from the source and target cells. -->

        <!-- Top row, left to right: data, oracle, replay, factory. -->
        <mxCell id="e1" value="" style="edgeStyle=orthogonalEdgeStyle;rounded=1;orthogonalLoop=1;jettySize=auto;html=1;strokeWidth=3;strokeColor=#334155;endArrow=block;endFill=1;" edge="1" parent="1" source="data" target="oracle">
          <mxGeometry relative="1" as="geometry" />
        </mxCell>
        <mxCell id="e2" value="" style="edgeStyle=orthogonalEdgeStyle;rounded=1;orthogonalLoop=1;jettySize=auto;html=1;strokeWidth=3;strokeColor=#334155;endArrow=block;endFill=1;" edge="1" parent="1" source="oracle" target="replay">
          <mxGeometry relative="1" as="geometry" />
        </mxCell>
        <mxCell id="e3" value="" style="edgeStyle=orthogonalEdgeStyle;rounded=1;orthogonalLoop=1;jettySize=auto;html=1;strokeWidth=3;strokeColor=#334155;endArrow=block;endFill=1;" edge="1" parent="1" source="replay" target="factory">
          <mxGeometry relative="1" as="geometry" />
        </mxCell>

        <!-- replay also feeds the section header, which starts the chain through row, split, models, artifact, outputs, lc, retrieval and callout. -->
        <mxCell id="e4" value="" style="edgeStyle=orthogonalEdgeStyle;rounded=1;orthogonalLoop=1;jettySize=auto;html=1;strokeWidth=3;strokeColor=#334155;endArrow=block;endFill=1;" edge="1" parent="1" source="replay" target="feature_summary">
          <mxGeometry relative="1" as="geometry" />
        </mxCell>
        <mxCell id="e5" value="" style="edgeStyle=orthogonalEdgeStyle;rounded=1;orthogonalLoop=1;jettySize=auto;html=1;strokeWidth=3;strokeColor=#334155;endArrow=block;endFill=1;" edge="1" parent="1" source="feature_summary" target="row">
          <mxGeometry relative="1" as="geometry" />
        </mxCell>
        <mxCell id="e6" value="" style="edgeStyle=orthogonalEdgeStyle;rounded=1;orthogonalLoop=1;jettySize=auto;html=1;strokeWidth=3;strokeColor=#334155;endArrow=block;endFill=1;" edge="1" parent="1" source="row" target="split">
          <mxGeometry relative="1" as="geometry" />
        </mxCell>
        <mxCell id="e7" value="" style="edgeStyle=orthogonalEdgeStyle;rounded=1;orthogonalLoop=1;jettySize=auto;html=1;strokeWidth=3;strokeColor=#334155;endArrow=block;endFill=1;" edge="1" parent="1" source="split" target="models">
          <mxGeometry relative="1" as="geometry" />
        </mxCell>
        <mxCell id="e8" value="" style="edgeStyle=orthogonalEdgeStyle;rounded=1;orthogonalLoop=1;jettySize=auto;html=1;strokeWidth=3;strokeColor=#334155;endArrow=block;endFill=1;" edge="1" parent="1" source="models" target="artifact">
          <mxGeometry relative="1" as="geometry" />
        </mxCell>
        <mxCell id="e9" value="" style="edgeStyle=orthogonalEdgeStyle;rounded=1;orthogonalLoop=1;jettySize=auto;html=1;strokeWidth=3;strokeColor=#334155;endArrow=block;endFill=1;" edge="1" parent="1" source="artifact" target="outputs">
          <mxGeometry relative="1" as="geometry" />
        </mxCell>
        <mxCell id="e10" value="" style="edgeStyle=orthogonalEdgeStyle;rounded=1;orthogonalLoop=1;jettySize=auto;html=1;strokeWidth=3;strokeColor=#334155;endArrow=block;endFill=1;" edge="1" parent="1" source="outputs" target="lc">
          <mxGeometry relative="1" as="geometry" />
        </mxCell>
        <mxCell id="e11" value="" style="edgeStyle=orthogonalEdgeStyle;rounded=1;orthogonalLoop=1;jettySize=auto;html=1;strokeWidth=3;strokeColor=#334155;endArrow=block;endFill=1;" edge="1" parent="1" source="lc" target="retrieval">
          <mxGeometry relative="1" as="geometry" />
        </mxCell>
        <mxCell id="e12" value="" style="edgeStyle=orthogonalEdgeStyle;rounded=1;orthogonalLoop=1;jettySize=auto;html=1;strokeWidth=3;strokeColor=#334155;endArrow=block;endFill=1;" edge="1" parent="1" source="retrieval" target="callout">
          <mxGeometry relative="1" as="geometry" />
        </mxCell>
      </root>
    </mxGraphModel>
  </diagram>
</mxfile>
