
Commit 54b5bfa

Thomas Hoffmann, Ian Fan, Dimitri Kartsaklis, Nikhil Khatri and Charles London committed

Release version 0.3.0

Co-authored-by: Ian Fan <ian.fan@quantinuum.com>
Co-authored-by: Dimitri Kartsaklis <dimitri.kartsaklis@quantinuum.com>
Co-authored-by: Nikhil Khatri <nikhil.khatri@quantinuum.com>
Co-authored-by: Charles London <charles.london@quantinuum.com>
Co-authored-by: Richie Yeung <richie.yeung@quantinuum.com>

1 parent 70a1fe8, commit 54b5bfa


97 files changed (+4677 -766 lines)

.github/workflows/build_test.yml (+8 -9)

@@ -17,7 +17,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [ 3.8, 3.9, "3.10" ]
+        python-version: [ 3.8, 3.9, "3.10", "3.11" ]
     outputs:
       error-check: ${{ steps.error-check.conclusion }}
     steps:
@@ -47,16 +47,13 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [ 3.8, 3.9, "3.10" ]
+        python-version: [ 3.8, 3.9, "3.10", "3.11" ]
     steps:
     - uses: actions/checkout@v3
     - name: Setup Python ${{ matrix.python-version }}
       uses: actions/setup-python@v4
       with:
         python-version: ${{ matrix.python-version }}
-    - name: Install DisCoPy 0.5 from GitHub
-      if: github.ref_name != 'release' && github.ref_name != 'beta'
-      run: pip install git+https://github.com/discopy/discopy@0.5
     - name: Install base package
       run: pip install .
     - name: Check package import works
@@ -65,7 +62,7 @@ jobs:
       run: pip install .[extras] .[test]
     - name: Locate bobcat pre-trained model cache
       id: loc-bobcat-cache
-      run: echo "::set-output name=dir::$(python -c 'from lambeq.text2diagram.bobcat_parser import get_model_dir; print(get_model_dir("bert"))')"
+      run: echo "dir=$(python -c 'from lambeq.text2diagram.model_downloader import ModelDownloader; print(ModelDownloader("bert").model_dir)')" >> $GITHUB_OUTPUT
     - name: Restore bobcat pre-trained model from cache
       id: bobcat-cache
       uses: actions/cache@v2
@@ -82,18 +79,20 @@ jobs:
         --ignore=docs/extract_code_cells.py
     - name: Determine if depccg tests should be run
       # only test depccg if it is explicitly changed, since it is very slow
+      # tests are also disabled on Python 3.11
       id: depccg-enabled
       continue-on-error: true # this is expected to fail but the job should still succeed
       run: >
-        git fetch --depth=1 origin ${{ github.base_ref || github.event.before }}:before
+        ${{ matrix.python-version != '3.11' }}
+        && git fetch --depth=1 origin ${{ github.base_ref || github.event.before }}:before
         && git diff --name-only before | grep depccg
     - name: Install depccg and locate depccg pre-trained model cache
       id: loc-depccg-cache
       if: steps.depccg-enabled.outcome == 'success'
       run: |
         pip install cython # must be installed before depccg
         pip install depccg==2.0.3.2
-        echo "::set-output name=dir::$(python -c 'from depccg.instance_models import MODEL_DIRECTORY, MODELS; print(MODEL_DIRECTORY / MODELS["en"][1])')"
+        echo "dir=$(python -c 'from depccg.instance_models import MODEL_DIRECTORY, MODELS; print(MODEL_DIRECTORY / MODELS["en"][1])')" >> $GITHUB_OUTPUT
         pip install lambeq # override dependency conflicts
     - name: Restore depccg pre-trained model from cache
       id: depccg-cache
@@ -117,7 +116,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [ 3.8, 3.9, "3.10" ]
+        python-version: [ 3.8, 3.9, "3.10", "3.11" ]
     steps:
     - uses: actions/checkout@v3
     - name: Setup Python ${{ matrix.python-version }}

.github/workflows/docs.yml (-2)

@@ -31,8 +31,6 @@ jobs:
         pip install -r docs/requirements.txt
     - name: Build documentation
       run: ${{ env.WORKFLOWS_DIR }}/build-docs
-    - name: Move install script
-      run: mv install.sh docs/_build/html
     - name: Deploy documentation
       if: ${{ github.event_name == 'push' && (github.ref_name == 'main' || github.ref_name == 'release') }}
       uses: s0/git-publish-subdir-action@develop

docs/conf.py (+2 -2)

@@ -25,7 +25,7 @@
 
 
 project = 'lambeq'
-copyright = '2021-2022 Cambridge Quantum Computing Ltd.'
+copyright = '2021-2023 Cambridge Quantum Computing Ltd.'
 author = 'Cambridge Quantum QNLP Dev Team'
 
 # -- General configuration ---------------------------------------------------
@@ -34,9 +34,9 @@
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
 extensions = [
-    'm2r2',
     'nbsphinx',
     'numpydoc',
+    'sphinx_mdinclude',
     'sphinx.ext.autodoc',
     'sphinx.ext.viewcode',
     'sphinx.ext.graphviz',

Large notebook diffs (not rendered by default):

docs/examples/classical_pipeline.ipynb (+11 -13)
docs/examples/pennylane.ipynb (+675)
docs/examples/quantum_pipeline.ipynb (+23 -23)
docs/examples/quantum_pipeline_jax.ipynb (+29 -18)
docs/examples/tokenisation.ipynb (+181)

docs/glossary.rst (+4 -1)

@@ -59,6 +59,9 @@ Glossary
    IQP circuit
       Instantaneous Quantum Polynomial. A circuit which interleaves layers of Hadamard :term:`quantum gates <quantum gate>` with diagonal unitaries.
 
+   loss function
+      In machine learning, a function that estimates how far the prediction of a :term:`model` is from its true value. The purpose of training is to minimise the loss over the training set.
+
    matrix product state (MPS)
       A factorization of a large tensor into a chain-like product of smaller tensors. ``lambeq`` is equipped with :term:`ansätze <ansatz (plural: ansätze)>` that implement various forms of matrix product states, allowing the execution of large :term:`tensor networks <tensor network>` on classical hardware.
 
@@ -81,7 +84,7 @@ Glossary
       A statistical tool that converts a sentence into a hierarchical representation that reflects the syntactic relationships between the words (a :term:`syntax tree`) based on a specific grammar formalism.
 
    PennyLane
-      A Python library for differentiable programming of quantum computers, developed by Xanadu, enabling quantum machine learning.
+      A Python library for differentiable programming of quantum computers, developed by Xanadu, enabling quantum machine learning. See more `here <https://pennylane.ai/qml/>`_.
 
    post-selection
       The act of conditioning the probability space on a particular event. In practice, this involves disregarding measurement outcomes where a particular qubit does not match the post-selected value.

docs/index.rst (+12)

@@ -23,6 +23,8 @@ User support
 
 If you need help with ``lambeq`` or you think you have found a bug, please send an email to lambeq-support@cambridgequantum.com. You can also open an issue at ``lambeq``'s `GitHub repository <https://github.com/CQCL/lambeq>`_. Someone from the development team will respond to you as soon as possible. Furthermore, if you want to subscribe to ``lambeq``'s mailing list (lambeq-users@cambridgequantum.com), send an email to lambeq-support@cambridgequantum.com to let us know.
 
+Note that the best way to get in touch with the QNLP community and learn about ``lambeq`` is to join our `QNLP discord server <https://discord.gg/TA63zghMrC>`_, where you can ask questions, get notified about important announcements and news, and chat with other QNLP researchers.
+
 Licence
 -------
 
@@ -54,6 +56,16 @@ If you use ``lambeq`` for your research, please cite the accompanying paper [Kea
    use-cases
    CONTRIBUTING
 
+.. toctree::
+   :caption: NLP-101
+   :maxdepth: 2
+
+   nlp-intro
+   nlp-data
+   nlp-class
+   nlp-ml
+   nlp-refs
+
 .. toctree::
    :caption: Tutorials
    :maxdepth: 2

docs/models.rst (+56)

@@ -47,6 +47,62 @@ To use the :py:class:`.NumpyModel` with ``jit`` mode, you need to install ``lamb
 
 - :ref:`uc1`
 
+.. _sec-pennylanemodel:
+
+PennyLaneModel
+--------------
+
+:py:class:`.PennyLaneModel` uses :term:`PennyLane` and :term:`PyTorch` to allow classical-quantum machine learning experiments. With ``probabilities=False``, :py:class:`.PennyLaneModel` performs a state vector simulation, while with ``probabilities=True`` it performs a probability simulation. The state vector and probability simulations correspond to DisCoPy's unitary and density matrix simulations.
+
+To run the model on real quantum hardware, ``probabilities=True`` must be used, so that the ``lambeq`` circuits are optimized using the parameter-shift rule to calculate the gradients.
+
+:py:class:`.PennyLaneModel` can be used to optimize simulated circuits using exact backpropagation with PyTorch, which may give improved results over using :py:class:`.NumpyModel` with :py:class:`.SPSAOptimizer`. However, this optimization process is not possible on real quantum hardware, so for more realistic results the parameter-shift rule should be preferred.
+
+To construct a hybrid model that passes the output of a circuit through a classical neural network, it is only necessary to subclass :py:class:`.PennyLaneModel` and modify the :py:meth:`~.PennyLaneModel.__init__` method to store the classical PyTorch parameters, and the :py:meth:`~.PennyLaneModel.forward` method to pass the result of :py:meth:`~.PennyLaneModel.get_diagram_output` to the neural network. For example:
+
+.. code-block:: python
+
+    import torch
+    from lambeq import PennyLaneModel
+
+    class MyCustomModel(PennyLaneModel):
+        def __init__(self, **kwargs):
+            super().__init__(**kwargs)
+            self.net = torch.nn.Linear(2, 2)
+
+        def forward(self, input):
+            preds = self.get_diagram_output(input)
+            return self.net(preds)
+
+This neural net can be real- or complex-valued, though this affects the non-linearities that can be used.
+
+:py:class:`.PennyLaneModel` can be used with the :py:class:`.PytorchTrainer`, or a standard PyTorch training loop.
+
+By using different backend configurations, :py:class:`.PennyLaneModel` can be used for several different use cases, listed below:
+
+.. _tbl-plane-usecases:
+.. csv-table:: Backend configurations for different use cases.
+   :header: "Use case", "Configurations"
+   :widths: 25, 50
+
+   "Exact non :term:`shot-based <shots>` simulation with state outputs", "``{'backend': 'default.qubit', 'probabilities'=False}``"
+   "Exact non shot-based simulation with probability outputs", "``{'backend': 'default.qubit', 'probabilities'=True}``"
+   "Noiseless shot-based simulation", "``{'backend': 'default.qubit', 'shots'=1000, 'probabilities'=True}``"
+   "Noisy shot-based simulation on local hardware", "``{'backend': 'qiskit.aer', noise_model=my_noise_model, 'shots'=1000, 'probabilities'=True}``, where ``my_noise_model`` is an AER :py:class:`NoiseModel`."
+   "Noisy shot-based simulation on cloud-based emulators", "| ``{'backend': 'qiskit.ibmq', 'device'='ibmq_qasm_simulator', 'shots'=1000, 'probabilities'=True}``
+   | ``{'backend': 'honeywell.hqs', device=('H1-1E' or 'H1-2E'), 'shots'=1000, 'probabilities'=True}``"
+   "Evaluation of quantum circuits on a quantum computer", "| ``{'backend': 'qiskit.ibmq', 'device'='ibmq_hardware_device', 'shots'=1000, 'probabilities'=True}``, where ``ibmq_hardware_device`` is one that you have access to via your IBMQ account.
+   | ``{'backend': 'honeywell.hqs', device=('H1' or 'H1-1' or 'H1-2'), 'shots'=1000, 'probabilities'=True}``"
+
+All of these backends are compatible with hybrid quantum-classical models. Note that quantum hardware and cloud-based emulators are much slower than local simulations.
+
+.. rubric:: See also the following use cases:
+
+- :ref:`uc1`
+- :ref:`uc2`
+- :ref:`uc3`
+- :ref:`uc5`
 
 .. _sec-pytorchmodel:
 
 PytorchModel
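
As a rough editorial sketch to complement the new section above (not part of the commit), the following shows an end-to-end run: two toy sentences are parsed, converted into circuits, and a plain ``PennyLaneModel`` is trained with a standard PyTorch loop. The sentences, labels and ansatz settings are invented, and the ``probabilities``/``backend_config`` keyword arguments are assumed to follow the backend-configuration table above rather than being guaranteed by this diff.

.. code-block:: python

    import torch
    from lambeq import AtomicType, BobcatParser, IQPAnsatz, PennyLaneModel

    # Toy training data: two sentences with one-hot binary labels.
    sentences = ['Alice prepares qubits.', 'Bob runs software.']
    labels = torch.tensor([[1.0, 0.0], [0.0, 1.0]])

    # Parse the sentences and turn the diagrams into parameterised circuits.
    parser = BobcatParser()
    diagrams = [parser.sentence2diagram(s) for s in sentences]
    ansatz = IQPAnsatz({AtomicType.NOUN: 1, AtomicType.SENTENCE: 1}, n_layers=1)
    circuits = [ansatz(d) for d in diagrams]

    # Exact (non shot-based) simulation with probability outputs.
    model = PennyLaneModel.from_diagrams(
        circuits,
        probabilities=True,
        backend_config={'backend': 'default.qubit'})
    model.initialise_weights()

    # Standard PyTorch training loop with exact backpropagation.
    optimizer = torch.optim.Adam(model.parameters(), lr=0.05)
    loss_fn = torch.nn.MSELoss()

    for epoch in range(10):
        optimizer.zero_grad()
        preds = model(circuits)                    # one probability pair per sentence
        loss = loss_fn(preds, labels.to(preds.dtype))
        loss.backward()
        optimizer.step()

The same loop would work for a subclass such as the ``MyCustomModel`` shown in the diff, since its ``forward`` method also returns a tensor of predictions.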

docs/nlp-class.rst (+70, new file)

Text classification
===================

One of the most fundamental tasks in NLP is text classification, which involves categorising textual data into predefined categories. It plays a vital role in a variety of NLP applications, including sentiment analysis, spam detection, topic modelling, and language identification, among others. By categorising texts into relevant categories, machines can analyse and derive insights from large volumes of textual data, making it possible to automate decision-making processes and perform tasks that would otherwise be time-consuming or impossible for humans to do.

Binary vs multi-class classification
------------------------------------

Binary classification and multi-class classification both involve assigning a label or category to an input data point. In `binary classification`, there are only two possible output categories, and the goal is to classify input data points into one of these two categories; for example, classifying emails as spam or not spam.

On the other hand, `multi-class classification` involves assigning a data point to one of more than two possible output categories; for example, classifying images of animals into categories such as cats, dogs, and birds.

Multi-class classification problems can be further divided into two subcategories: multi-class `single-label` classification and multi-class `multi-label` classification. In multi-class single-label classification, each input data point is assigned to one and only one output category. In contrast, in multi-class multi-label classification, each input data point can be assigned to one or more output categories simultaneously.

In general, binary classification is a simpler and more straightforward problem to solve than multi-class classification, but multi-class classification problems are more representative of real-world scenarios, where there are multiple possible categories to which a data point could belong.
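
The three settings above differ mainly in how the target labels are encoded. As a purely editorial illustration (not part of the new file), assuming a three-class example with the classes cat, dog and bird:

.. code-block:: python

    import numpy as np

    # Binary classification: one scalar label per data point (1 = spam, 0 = not spam).
    binary_labels = np.array([1, 0, 0, 1])

    # Multi-class, single-label: exactly one of the classes (cat, dog, bird)
    # per data point, typically one-hot encoded.
    single_label = np.array([[1, 0, 0],    # cat
                             [0, 0, 1]])   # bird

    # Multi-class, multi-label: a data point may belong to several classes
    # at once, so the encoding is multi-hot.
    multi_label = np.array([[1, 1, 0],     # cat and dog
                            [0, 1, 1]])    # dog and bird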

Loss functions
--------------

For binary classification tasks, the loss function of choice is binary cross-entropy. Below, :math:`y_i` is the true label for the :math:`i` th data point, :math:`p(y_i)` represents the probability that the model assigns to the specific label, and :math:`N` is the number of data points.

.. math::

   H(p, q) = -\frac{1}{N}\sum_{i=1}^N [y_i \log(p(y_i)) + (1-y_i) \log(1-p(y_i))]

For multi-class classification, the loss function is usually the categorical version of cross-entropy. Here, :math:`M` is the number of classes, :math:`p(x_i)` is the true probability for the :math:`i` th class, and :math:`q(x_i)` the probability predicted by the model.

.. math::

   H(p, q) = -\sum_{i=1}^M p(x_i) \log(q(x_i))

.. note::

   ``lambeq`` provides a number of loss functions that can be used out-of-the-box during training, such as :py:class:`~.BinaryCrossEntropyLoss`, :py:class:`~.CrossEntropyLoss`, and :py:class:`~.MSELoss`.
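
As a quick editorial illustration (not part of the new file), both formulas can be computed directly with numpy; the labels and predicted probabilities below are made up:

.. code-block:: python

    import numpy as np

    # Binary cross-entropy: true labels y_i and predicted probabilities p(y_i).
    y = np.array([1, 0, 1, 1])
    p = np.array([0.9, 0.2, 0.7, 0.6])
    bce = -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))

    # Categorical cross-entropy for a single data point: true distribution p(x)
    # (one-hot) and predicted distribution q(x) over M = 3 classes.
    p_true = np.array([0.0, 1.0, 0.0])
    q_pred = np.array([0.1, 0.8, 0.1])
    cce = -np.sum(p_true * np.log(q_pred))

    print(f'binary cross-entropy: {bce:.4f}')        # ~0.299
    print(f'categorical cross-entropy: {cce:.4f}')   # ~0.223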
.. _sec-evaluation:

Evaluation metrics
------------------

The most common metrics to evaluate the performance of classification models are accuracy, precision, recall, and F-score. Each metric has its own strengths and weaknesses, and can be useful in different contexts.

- `Accuracy` is usually the standard way to evaluate classification, and it measures how often the model correctly predicts the class of an instance. It is calculated as the ratio of correct predictions to the total number of predictions. This metric can be useful when the classes in the dataset are balanced, meaning that there are roughly equal numbers of instances in each class. In this case, accuracy can provide a good overall measure of how well the model is performing.

  .. math::

     \text{Accuracy} = \frac{\text{True Positives} + \text{True Negatives}}{\text{True Positives} + \text{True Negatives} + \text{False Positives} + \text{False Negatives}}

- `Precision` is the proportion of true positive predictions among all positive predictions. It is expressed as the ratio of true positives to the total number of instances that the model predicts as positive. Precision is useful when the cost of false positives is high, such as in spam filtering or legal decision making.

  .. math::

     \text{Precision} = \frac{\text{True Positives}}{\text{True Positives} + \text{False Positives}}

- `Recall`, also known as `sensitivity`, is the proportion of true positive predictions among all actual positive instances in the dataset. Recall is calculated as the ratio of true positives to the total number of actual positive instances. It can be helpful when the goal of the model is to identify all instances of a particular class, such as in medical diagnosis or fraud detection.

  .. math::

     \text{Recall} = \frac{\text{True Positives}}{\text{True Positives} + \text{False Negatives}}

These two measures can be competing, in the sense that increasing precision can decrease recall and vice versa. This trade-off occurs because precision and recall measure different aspects of the model's performance. High precision means that the model is accurate in its positive predictions, but it may miss some true positive instances, leading to lower recall. On the other hand, high recall means that the model identifies most of the positive instances, but it may have more false positives, leading to lower precision.

To address this, researchers use the `F-score`, also known as the `F1` score, which is a combined measure of precision and recall. It is calculated as the harmonic mean of precision and recall and provides a way to balance these two metrics. The F-score is useful when both precision and recall are important, and can be used to compare models that have different trade-offs between these two metrics.

.. math::

   \text{F-score} = 2 \cdot \frac{\text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}}

.. note::

   For examples of text classification with ``lambeq``, see the :ref:`Training tutorial <sec-training>`.
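
As an editorial illustration (again, not part of the new file), all four metrics can be computed from the confusion-matrix counts of a toy set of binary predictions:

.. code-block:: python

    y_true = [1, 0, 1, 1, 0, 1, 0, 0]
    y_pred = [1, 0, 0, 1, 0, 1, 1, 1]

    # Confusion-matrix counts.
    tp = sum(t == 1 and p == 1 for t, p in zip(y_true, y_pred))
    tn = sum(t == 0 and p == 0 for t, p in zip(y_true, y_pred))
    fp = sum(t == 0 and p == 1 for t, p in zip(y_true, y_pred))
    fn = sum(t == 1 and p == 0 for t, p in zip(y_true, y_pred))

    accuracy = (tp + tn) / (tp + tn + fp + fn)                 # 0.625
    precision = tp / (tp + fp)                                 # 0.6
    recall = tp / (tp + fn)                                    # 0.75
    f_score = 2 * precision * recall / (precision + recall)    # ~0.667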
