drivers/block/drbd/drbd_nl.c

   1 /*
   2    drbd_nl.c
   3
   4    This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
   5
   6    Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   7    Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   8    Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
   9
  10    drbd is free software; you can redistribute it and/or modify
  11    it under the terms of the GNU General Public License as published by
  12    the Free Software Foundation; either version 2, or (at your option)
  13    any later version.
  14
  15    drbd is distributed in the hope that it will be useful,
  16    but WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18    GNU General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with drbd; see the file COPYING.  If not, write to
  22    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  23
  24  */
  25
  26 #include <linux/module.h>
  27 #include <linux/drbd.h>
  28 #include <linux/in.h>
  29 #include <linux/fs.h>
  30 #include <linux/file.h>
  31 #include <linux/slab.h>
  32 #include <linux/blkpg.h>
  33 #include <linux/cpumask.h>
  34 #include "drbd_int.h"
  35 #include "drbd_protocol.h"
  36 #include "drbd_req.h"
  37 #include "drbd_wrappers.h"
  38 #include <asm/unaligned.h>
  39 #include <linux/drbd_limits.h>
  40 #include <linux/kthread.h>
  41
  42 #include <net/genetlink.h>
  43
  44 /* .doit */
     /* Forward declarations of the request handlers; all of them share the
      * (struct sk_buff *, struct genl_info *) signature.
      * NOTE(review): presumably these are referenced by the code pulled in via
      * the genl_magic_func.h include further down -- confirm. */
  45 // int drbd_adm_create_resource(struct sk_buff *skb, struct genl_info *info);
  46 // int drbd_adm_delete_resource(struct sk_buff *skb, struct genl_info *info);
  47
  48 int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info);
  49 int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info);
  50
  51 int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info);
  52 int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info);
  53 int drbd_adm_down(struct sk_buff *skb, struct genl_info *info);
  54
  55 int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info);
  56 int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info);
  57 int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info);
  58 int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info);
  59 int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info);
  60 int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info);
  61 int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info);
  62 int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info);
  63 int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info);
  64 int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info);
  65 int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info);
  66 int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info);
  67 int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info);
  68 int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info);
  69 int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info);
  70 int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info);
  71 int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info);
  72 int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info);
  73 int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info);
  74 int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info);
  75 /* .dumpit */
     /* The dump handler has a different signature: it takes a
      * struct netlink_callback instead of a struct genl_info. */
  76 int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb);
  77
  78 #include <linux/drbd_genl_api.h>
  79 #include "drbd_nla.h"
  80 #include <linux/genl_magic_func.h>
  81
  82 /* used with blkdev_get_by_path, to claim our meta data device(s) */
     /* NOTE(review): presumably passed as the "holder" argument; no call site
      * is visible in this part of the file -- confirm. */
  83 static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
  84
  85 /* Configuration is strictly serialized, because generic netlink message
  86  * processing is strictly serialized by the genl_lock().
  87  * Which means we can use one static global drbd_config_context struct.
      *
      * drbd_adm_prepare() zeroes this struct and fills it in for the current
      * request; drbd_adm_finish() drops the connection and resource references
      * again and sends the reply.
  88  */
  89 static struct drbd_config_context {
  90         /* assigned from drbd_genlmsghdr */
  91         unsigned int minor;
  92         /* assigned from request attributes, if present */
             /* (stays VOLUME_UNSPECIFIED if the request carries no T_ctx_volume) */
  93         unsigned int volume;
  94 #define VOLUME_UNSPECIFIED              (-1U)
  95         /* pointer into the request skb,
  96          * limited lifetime! */
  97         char *resource_name;
             /* raw T_ctx_my_addr / T_ctx_peer_addr attributes of the request;
              * NULL if not present */
  98         struct nlattr *my_addr;
  99         struct nlattr *peer_addr;
 100
 101         /* reply buffer */
             /* (NULL if drbd_adm_prepare() failed before a reply could be set up) */
 102         struct sk_buff *reply_skb;
 103         /* pointer into reply buffer */
 104         struct drbd_genlmsghdr *reply_dh;
 105         /* resolved from attributes, if possible */
             /* device: looked up from the minor via minor_to_device();
              * resource: looked up from resource_name via drbd_find_resource();
              * connection: first connection of the resource, or looked up by
              * the my_addr/peer_addr pair */
 106         struct drbd_device *device;
 107         struct drbd_resource *resource;
 108         struct drbd_connection *connection;
 109 } adm_ctx;
 110
 111 static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info)
 112 {
 113         genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb))));
 114         if (genlmsg_reply(skb, info))
 115                 printk(KERN_ERR "drbd: error sending genl reply\n");
 116 }
 117
 118 /* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only
 119  * reason it could fail was no space in skb, and there are 4k available. */
 120 int drbd_msg_put_info(const char *info)
 121 {
 122         struct sk_buff *skb = adm_ctx.reply_skb;
 123         struct nlattr *nla;
 124         int err = -EMSGSIZE;
 125
 126         if (!info || !info[0])
 127                 return 0;
 128
 129         nla = nla_nest_start(skb, DRBD_NLA_CFG_REPLY);
 130         if (!nla)
 131                 return err;
 132
 133         err = nla_put_string(skb, T_info_text, info);
 134         if (err) {
 135                 nla_nest_cancel(skb, nla);
 136                 return err;
 137         } else
 138                 nla_nest_end(skb, nla);
 139         return 0;
 140 }
 141
 142 /* This would be a good candidate for a "pre_doit" hook,
 143  * and per-family private info->pointers.
 144  * But we need to stay compatible with older kernels.
 145  * If it returns successfully, adm_ctx members are valid.
      *
      * Resets the global adm_ctx, allocates the reply skb, copies the request's
      * context attributes into the reply, and resolves minor, resource and
      * connection as far as the request allows.
      *
      * @flags is a combination of the DRBD_ADM_NEED_* bits below and tells
      * which of these objects the request must resolve to.
      *
      * Return value:
      *  - an errno style error (e.g. -EPERM, -ENOMEM, -EINVAL) with
      *    adm_ctx.reply_skb == NULL: no reply can be sent;
      *  - an ERR_* code with adm_ctx.reply_skb still set: the request was
      *    rejected, an info text has been put into the reply;
      *  - NO_ERROR with adm_ctx.reply_skb set on success.
      * References taken here are dropped by drbd_adm_finish().
 146  */
 147 #define DRBD_ADM_NEED_MINOR     1
 148 #define DRBD_ADM_NEED_RESOURCE  2
 149 #define DRBD_ADM_NEED_CONNECTION 4
 150 static int drbd_adm_prepare(struct sk_buff *skb, struct genl_info *info,
 151                 unsigned flags)
 152 {
 153         struct drbd_genlmsghdr *d_in = info->userhdr;
 154         const u8 cmd = info->genlhdr->cmd;
 155         int err;
 156
             /* start from a clean slate: all pointers NULL, in particular reply_skb */
 157         memset(&adm_ctx, 0, sizeof(adm_ctx));
 158
 159         /* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */
 160         if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN))
 161                return -EPERM;
 162
 163         adm_ctx.reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
 164         if (!adm_ctx.reply_skb) {
 165                 err = -ENOMEM;
 166                 goto fail;
 167         }
 168
 169         adm_ctx.reply_dh = genlmsg_put_reply(adm_ctx.reply_skb,
 170                                         info, &drbd_genl_family, 0, cmd);
 171         /* put of a few bytes into a fresh skb of >= 4k will always succeed.
 172          * but anyways */
 173         if (!adm_ctx.reply_dh) {
 174                 err = -ENOMEM;
 175                 goto fail;
 176         }
 177
             /* echo the minor; ret_code is overwritten by drbd_adm_finish() */
 178         adm_ctx.reply_dh->minor = d_in->minor;
 179         adm_ctx.reply_dh->ret_code = NO_ERROR;
 180
 181         adm_ctx.volume = VOLUME_UNSPECIFIED;
 182         if (info->attrs[DRBD_NLA_CFG_CONTEXT]) {
 183                 struct nlattr *nla;
 184                 /* parse and validate only */
 185                 err = drbd_cfg_context_from_attrs(NULL, info);
 186                 if (err)
 187                         goto fail;
 188
 189                 /* It was present, and valid,
 190                  * copy it over to the reply skb. */
 191                 err = nla_put_nohdr(adm_ctx.reply_skb,
 192                                 info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len,
 193                                 info->attrs[DRBD_NLA_CFG_CONTEXT]);
 194                 if (err)
 195                         goto fail;
 196
 197                 /* and assign stuff to the global adm_ctx */
                     /* NOTE(review): nested_attr_tb is not declared in this part of
                      * the file; presumably it was filled by the
                      * drbd_cfg_context_from_attrs() call above -- confirm. */
 198                 nla = nested_attr_tb[__nla_type(T_ctx_volume)];
 199                 if (nla)
 200                         adm_ctx.volume = nla_get_u32(nla);
 201                 nla = nested_attr_tb[__nla_type(T_ctx_resource_name)];
 202                 if (nla)
 203                         adm_ctx.resource_name = nla_data(nla);
 204                 adm_ctx.my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)];
 205                 adm_ctx.peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)];
                     /* Reject addresses that would not fit into a connection's
                      * address members.  adm_ctx.connection is still NULL here;
                      * that is fine, sizeof does not evaluate its operand. */
 206                 if ((adm_ctx.my_addr &&
 207                      nla_len(adm_ctx.my_addr) > sizeof(adm_ctx.connection->my_addr)) ||
 208                     (adm_ctx.peer_addr &&
 209                      nla_len(adm_ctx.peer_addr) > sizeof(adm_ctx.connection->peer_addr))) {
 210                         err = -EINVAL;
 211                         goto fail;
 212                 }
 213         }
 214
 215         adm_ctx.minor = d_in->minor;
 216         adm_ctx.device = minor_to_device(d_in->minor);
 217         if (adm_ctx.resource_name) {
 218                 adm_ctx.resource = drbd_find_resource(adm_ctx.resource_name);
 219                 if (adm_ctx.resource) {
 220                         adm_ctx.connection = first_connection(adm_ctx.resource);
                             /* our own reference; dropped in drbd_adm_finish() */
 221                         kref_get(&adm_ctx.connection->kref);
 222                 }
 223         }
 224
             /* From here on failures return an ERR_* code and keep reply_skb (and
              * any references taken above), so that the caller can report the
              * error to user space via drbd_adm_finish(). */
 225         if (!adm_ctx.device && (flags & DRBD_ADM_NEED_MINOR)) {
 226                 drbd_msg_put_info("unknown minor");
 227                 return ERR_MINOR_INVALID;
 228         }
 229         if (!adm_ctx.resource && (flags & DRBD_ADM_NEED_RESOURCE)) {
 230                 drbd_msg_put_info("unknown resource");
 231                 if (adm_ctx.resource_name)
 232                         return ERR_RES_NOT_KNOWN;
 233                 return ERR_INVALID_REQUEST;
 234         }
 235
 236         if (flags & DRBD_ADM_NEED_CONNECTION) {
                     /* a connection found via the resource name is only acceptable
                      * if the caller asked for a resource as well */
 237                 if (adm_ctx.connection && !(flags & DRBD_ADM_NEED_RESOURCE)) {
 238                         drbd_msg_put_info("no resource name expected");
 239                         return ERR_INVALID_REQUEST;
 240                 }
 241                 if (adm_ctx.device) {
 242                         drbd_msg_put_info("no minor number expected");
 243                         return ERR_INVALID_REQUEST;
 244                 }
                     /* NOTE(review): if a connection was already resolved (and
                      * referenced) via the resource name above, this assignment
                      * overwrites it without dropping that reference -- confirm
                      * whether that combination can occur. */
 245                 if (adm_ctx.my_addr && adm_ctx.peer_addr)
 246                         adm_ctx.connection = conn_get_by_addrs(nla_data(adm_ctx.my_addr),
 247                                                           nla_len(adm_ctx.my_addr),
 248                                                           nla_data(adm_ctx.peer_addr),
 249                                                           nla_len(adm_ctx.peer_addr));
 250                 if (!adm_ctx.connection) {
 251                         drbd_msg_put_info("unknown connection");
 252                         return ERR_INVALID_REQUEST;
 253                 }
 254         }
 255
 256         /* some more paranoia, if the request was over-determined */
 257         if (adm_ctx.device && adm_ctx.resource &&
 258             adm_ctx.device->resource != adm_ctx.resource) {
 259                 pr_warning("request: minor=%u, resource=%s; but that minor belongs to resource %s\n",
 260                                 adm_ctx.minor, adm_ctx.resource->name,
 261                                 adm_ctx.device->resource->name);
 262                 drbd_msg_put_info("minor exists in different resource");
 263                 return ERR_INVALID_REQUEST;
 264         }
 265         if (adm_ctx.device &&
 266             adm_ctx.volume != VOLUME_UNSPECIFIED &&
 267             adm_ctx.volume != adm_ctx.device->vnr) {
 268                 pr_warning("request: minor=%u, volume=%u; but that minor is volume %u in %s\n",
 269                                 adm_ctx.minor, adm_ctx.volume,
 270                                 adm_ctx.device->vnr,
 271                                 adm_ctx.device->resource->name);
 272                 drbd_msg_put_info("minor exists as different volume");
 273                 return ERR_INVALID_REQUEST;
 274         }
 275
 276         return NO_ERROR;
 277
             /* Only reached before any object was resolved: drop the reply buffer
              * so that callers can tell "no reply possible" from reply_skb == NULL. */
 278 fail:
 279         nlmsg_free(adm_ctx.reply_skb);
 280         adm_ctx.reply_skb = NULL;
 281         return err;
 282 }
 283
 284 static int drbd_adm_finish(struct genl_info *info, int retcode)
 285 {
 286         if (adm_ctx.connection) {
 287                 kref_put(&adm_ctx.connection->kref, drbd_destroy_connection);
 288                 adm_ctx.connection = NULL;
 289         }
 290         if (adm_ctx.resource) {
 291                 kref_put(&adm_ctx.resource->kref, drbd_destroy_resource);
 292                 adm_ctx.resource = NULL;
 293         }
 294
 295         if (!adm_ctx.reply_skb)
 296                 return -ENOMEM;
 297
 298         adm_ctx.reply_dh->ret_code = retcode;
 299         drbd_adm_send_reply(adm_ctx.reply_skb, info);
 300         return 0;
 301 }
 302
 303 static void setup_khelper_env(struct drbd_connection *connection, char **envp)
 304 {
 305         char *afs;
 306
 307         /* FIXME: A future version will not allow this case. */
 308         if (connection->my_addr_len == 0 || connection->peer_addr_len == 0)
 309                 return;
 310
 311         switch (((struct sockaddr *)&connection->peer_addr)->sa_family) {
 312         case AF_INET6:
 313                 afs = "ipv6";
 314                 snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI6",
 315                          &((struct sockaddr_in6 *)&connection->peer_addr)->sin6_addr);
 316                 break;
 317         case AF_INET:
 318                 afs = "ipv4";
 319                 snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4",
 320                          &((struct sockaddr_in *)&connection->peer_addr)->sin_addr);
 321                 break;
 322         default:
 323                 afs = "ssocks";
 324                 snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4",
 325                          &((struct sockaddr_in *)&connection->peer_addr)->sin_addr);
 326         }
 327         snprintf(envp[3], 20, "DRBD_PEER_AF=%s", afs);
 328 }
 329
 330 int drbd_khelper(struct drbd_device *device, char *cmd)
 331 {
 332         char *envp[] = { "HOME=/",
 333                         "TERM=linux",
 334                         "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
 335                          (char[20]) { }, /* address family */
 336                          (char[60]) { }, /* address */
 337                         NULL };
 338         char mb[12];
 339         char *argv[] = {usermode_helper, cmd, mb, NULL };
 340         struct drbd_connection *connection = first_peer_device(device)->connection;
 341         struct sib_info sib;
 342         int ret;
 343
 344         if (current == connection->worker.task)
 345                 set_bit(CALLBACK_PENDING, &connection->flags);
 346
 347         snprintf(mb, 12, "minor-%d", device_to_minor(device));
 348         setup_khelper_env(connection, envp);
 349
 350         /* The helper may take some time.
 351          * write out any unsynced meta data changes now */
 352         drbd_md_sync(device);
 353
 354         dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
 355         sib.sib_reason = SIB_HELPER_PRE;
 356         sib.helper_name = cmd;
 357         drbd_bcast_event(device, &sib);
 358         ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
 359         if (ret)
 360                 dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
 361                                 usermode_helper, cmd, mb,
 362                                 (ret >> 8) & 0xff, ret);
 363         else
 364                 dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
 365                                 usermode_helper, cmd, mb,
 366                                 (ret >> 8) & 0xff, ret);
 367         sib.sib_reason = SIB_HELPER_POST;
 368         sib.helper_exit_code = ret;
 369         drbd_bcast_event(device, &sib);
 370
 371         if (current == connection->worker.task)
 372                 clear_bit(CALLBACK_PENDING, &connection->flags);
 373
 374         if (ret < 0) /* Ignore any ERRNOs we got. */
 375                 ret = 0;
 376
 377         return ret;
 378 }
 379
 380 static int conn_khelper(struct drbd_connection *connection, char *cmd)
 381 {
 382         char *envp[] = { "HOME=/",
 383                         "TERM=linux",
 384                         "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
 385                          (char[20]) { }, /* address family */
 386                          (char[60]) { }, /* address */
 387                         NULL };
 388         char *resource_name = connection->resource->name;
 389         char *argv[] = {usermode_helper, cmd, resource_name, NULL };
 390         int ret;
 391
 392         setup_khelper_env(connection, envp);
 393         conn_md_sync(connection);
 394
 395         conn_info(connection, "helper command: %s %s %s\n", usermode_helper, cmd, resource_name);
 396         /* TODO: conn_bcast_event() ?? */
 397
 398         ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
 399         if (ret)
 400                 conn_warn(connection, "helper command: %s %s %s exit code %u (0x%x)\n",
 401                           usermode_helper, cmd, resource_name,
 402                           (ret >> 8) & 0xff, ret);
 403         else
 404                 conn_info(connection, "helper command: %s %s %s exit code %u (0x%x)\n",
 405                           usermode_helper, cmd, resource_name,
 406                           (ret >> 8) & 0xff, ret);
 407         /* TODO: conn_bcast_event() ?? */
 408
 409         if (ret < 0) /* Ignore any ERRNOs we got. */
 410                 ret = 0;
 411
 412         return ret;
 413 }
 414
 415 static enum drbd_fencing_p highest_fencing_policy(struct drbd_connection *connection)
 416 {
 417         enum drbd_fencing_p fp = FP_NOT_AVAIL;
 418         struct drbd_device *device;
 419         int vnr;
 420
 421         rcu_read_lock();
 422         idr_for_each_entry(&connection->volumes, device, vnr) {
 423                 if (get_ldev_if_state(device, D_CONSISTENT)) {
 424                         fp = max_t(enum drbd_fencing_p, fp,
 425                                    rcu_dereference(device->ldev->disk_conf)->fencing);
 426                         put_ldev(device);
 427                 }
 428         }
 429         rcu_read_unlock();
 430
 431         return fp;
 432 }
 433
     /*
      * Try to get the peer of @connection fenced by running the "fence-peer"
      * helper, and translate the helper's exit code into a state change.
      *
      * Returns
      *  - false if the connection state is already >= C_WF_REPORT_PARAMS on
      *    entry, or if the helper returned an unexpected exit code;
      *  - true if the highest fencing policy is FP_DONT_CARE (no helper is run);
      *  - otherwise whether conn_highest_pdsk() is <= D_OUTDATED afterwards.
      */
 434 bool conn_try_outdate_peer(struct drbd_connection *connection)
 435 {
 436         unsigned int connect_cnt;
 437         union drbd_state mask = { };
 438         union drbd_state val = { };
 439         enum drbd_fencing_p fp;
 440         char *ex_to_string;
 441         int r;
 442
 443         if (connection->cstate >= C_WF_REPORT_PARAMS) {
 444                 conn_err(connection, "Expected cstate < C_WF_REPORT_PARAMS\n");
 445                 return false;
 446         }
 447
             /* remember the connect count, to detect below whether the connection
              * changed while the helper was running */
 448         spin_lock_irq(&connection->req_lock);
 449         connect_cnt = connection->connect_cnt;
 450         spin_unlock_irq(&connection->req_lock);
 451
 452         fp = highest_fencing_policy(connection);
 453         switch (fp) {
 454         case FP_NOT_AVAIL:
 455                 conn_warn(connection, "Not fencing peer, I'm not even Consistent myself.\n");
                     /* mask and val are still all zero at this point */
 456                 goto out;
 457         case FP_DONT_CARE:
 458                 return true;
 459         default: ;
 460         }
 461
 462         r = conn_khelper(connection, "fence-peer");
 463
             /* bits 8..15 of the helper result carry the exit code we switch on */
 464         switch ((r>>8) & 0xff) {
 465         case 3: /* peer is inconsistent */
 466                 ex_to_string = "peer is inconsistent or worse";
 467                 mask.pdsk = D_MASK;
 468                 val.pdsk = D_INCONSISTENT;
 469                 break;
 470         case 4: /* peer got outdated, or was already outdated */
 471                 ex_to_string = "peer was fenced";
 472                 mask.pdsk = D_MASK;
 473                 val.pdsk = D_OUTDATED;
 474                 break;
 475         case 5: /* peer was down */
 476                 if (conn_highest_disk(connection) == D_UP_TO_DATE) {
 477                         /* we will(have) create(d) a new UUID anyways... */
 478                         ex_to_string = "peer is unreachable, assumed to be dead";
 479                         mask.pdsk = D_MASK;
 480                         val.pdsk = D_OUTDATED;
 481                 } else {
 482                         ex_to_string = "peer unreachable, doing nothing since disk != UpToDate";
 483                 }
 484                 break;
 485         case 6: /* Peer is primary, voluntarily outdate myself.
 486                  * This is useful when an unconnected R_SECONDARY is asked to
 487                  * become R_PRIMARY, but finds the other peer being active. */
 488                 ex_to_string = "peer is active";
 489                 conn_warn(connection, "Peer is primary, outdating myself.\n");
 490                 mask.disk = D_MASK;
 491                 val.disk = D_OUTDATED;
 492                 break;
 493         case 7:
                     /* only complained about, the peer is considered outdated anyway */
 494                 if (fp != FP_STONITH)
 495                         conn_err(connection, "fence-peer() = 7 && fencing != Stonith !!!\n");
 496                 ex_to_string = "peer was stonithed";
 497                 mask.pdsk = D_MASK;
 498                 val.pdsk = D_OUTDATED;
 499                 break;
 500         default:
 501                 /* The script is broken ... */
 502                 conn_err(connection, "fence-peer helper broken, returned %d\n", (r>>8)&0xff);
 503                 return false; /* Eventually leave IO frozen */
 504         }
 505
 506         conn_info(connection, "fence-peer helper returned %d (%s)\n",
 507                   (r>>8) & 0xff, ex_to_string);
 508
 509  out:
 510
 511         /* Not using
 512            conn_request_state(connection, mask, val, CS_VERBOSE);
 513            here, because we might have been able to re-establish the connection
 514            in the meantime. */
 515         spin_lock_irq(&connection->req_lock);
 516         if (connection->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &connection->flags)) {
 517                 if (connection->connect_cnt != connect_cnt)
 518                         /* In case the connection was established and dropped
 519                            while the fence-peer handler was running, ignore it */
 520                         conn_info(connection, "Ignoring fence-peer exit code\n");
 521                 else
 522                         _conn_request_state(connection, mask, val, CS_VERBOSE);
 523         }
 524         spin_unlock_irq(&connection->req_lock);
 525
 526         return conn_highest_pdsk(connection) <= D_OUTDATED;
 527 }
 528
 529 static int _try_outdate_peer_async(void *data)
 530 {
 531         struct drbd_connection *connection = (struct drbd_connection *)data;
 532
 533         conn_try_outdate_peer(connection);
 534
 535         kref_put(&connection->kref, drbd_destroy_connection);
 536         return 0;
 537 }
 538
 539 void conn_try_outdate_peer_async(struct drbd_connection *connection)
 540 {
 541         struct task_struct *opa;
 542
 543         kref_get(&connection->kref);
 544         opa = kthread_run(_try_outdate_peer_async, connection, "drbd_async_h");
 545         if (IS_ERR(opa)) {
 546                 conn_err(connection, "out of mem, failed to invoke fence-peer helper\n");
 547                 kref_put(&connection->kref, drbd_destroy_connection);
 548         }
 549 }
 550
     /*
      * Change the role of @device to @new_role.
      *
      * @device:   device to change
      * @new_role: R_PRIMARY or R_SECONDARY
      * @force:    if non-zero, a local disk that is at least D_INCONSISTENT but
      *            not D_UP_TO_DATE may be declared D_UP_TO_DATE, and the peer
      *            may be assumed D_OUTDATED even if fencing it failed
      *
      * The state change is requested up to max_tries times; between attempts
      * the requested mask/val are adjusted depending on why the previous
      * attempt was refused.  device->state_mutex is held for the whole call.
      *
      * Returns the result of the last state request; values < SS_SUCCESS mean
      * the role was not changed.
      */
 551 enum drbd_state_rv
 552 drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
 553 {
 554         const int max_tries = 4;
 555         enum drbd_state_rv rv = SS_UNKNOWN_ERROR;
 556         struct net_conf *nc;
 557         int try = 0;
 558         int forced = 0;
 559         union drbd_state mask, val;
 560
 561         if (new_role == R_PRIMARY)
 562                 request_ping(first_peer_device(device)->connection); /* Detect a dead peer ASAP */
 563
 564         mutex_lock(device->state_mutex);
 565
             /* start out by requesting nothing but the role change */
 566         mask.i = 0; mask.role = R_MASK;
 567         val.i  = 0; val.role  = new_role;
 568
 569         while (try++ < max_tries) {
 570                 rv = _drbd_request_state(device, mask, val, CS_WAIT_COMPLETE);
 571
 572                 /* in case we first succeeded to outdate,
 573                  * but now suddenly could establish a connection */
 574                 if (rv == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) {
                             /* retry without the peer disk state part */
 575                         val.pdsk = 0;
 576                         mask.pdsk = 0;
 577                         continue;
 578                 }
 579
                     /* forced: retry, additionally declaring the local disk UpToDate */
 580                 if (rv == SS_NO_UP_TO_DATE_DISK && force &&
 581                     (device->state.disk < D_UP_TO_DATE &&
 582                      device->state.disk >= D_INCONSISTENT)) {
 583                         mask.disk = D_MASK;
 584                         val.disk  = D_UP_TO_DATE;
 585                         forced = 1;
 586                         continue;
 587                 }
 588
                     /* only Consistent: if the peer can be fenced, retry with the
                      * local disk promoted to UpToDate */
 589                 if (rv == SS_NO_UP_TO_DATE_DISK &&
 590                     device->state.disk == D_CONSISTENT && mask.pdsk == 0) {
 591                         D_ASSERT(device->state.pdsk == D_UNKNOWN);
 592
 593                         if (conn_try_outdate_peer(first_peer_device(device)->connection)) {
 594                                 val.disk = D_UP_TO_DATE;
 595                                 mask.disk = D_MASK;
 596                         }
 597                         continue;
 598                 }
 599
 600                 if (rv == SS_NOTHING_TO_DO)
 601                         goto out;
 602                 if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) {
                             /* fencing failed: only with force do we go on and
                              * declare the peer Outdated ourselves */
 603                         if (!conn_try_outdate_peer(first_peer_device(device)->connection) && force) {
 604                                 dev_warn(DEV, "Forced into split brain situation!\n");
 605                                 mask.pdsk = D_MASK;
 606                                 val.pdsk  = D_OUTDATED;
 607
 608                         }
 609                         continue;
 610                 }
 611                 if (rv == SS_TWO_PRIMARIES) {
 612                         /* Maybe the peer is detected as dead very soon...
 613                            retry at most once more in this case. */
 614                         int timeo;
 615                         rcu_read_lock();
 616                         nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
                             /* (ping_timeo + 1) * HZ / 10 jiffies, or a single
                              * jiffy if there is no net_conf */
 617                         timeo = nc ? (nc->ping_timeo + 1) * HZ / 10 : 1;
 618                         rcu_read_unlock();
 619                         schedule_timeout_interruptible(timeo);
                             /* leave room for exactly one more iteration */
 620                         if (try < max_tries)
 621                                 try = max_tries - 1;
 622                         continue;
 623                 }
 624                 if (rv < SS_SUCCESS) {
                             /* any other failure: repeat the request once with
                              * CS_VERBOSE added.
                              * NOTE(review): presumably to get the reason logged
                              * -- confirm. */
 625                         rv = _drbd_request_state(device, mask, val,
 626                                                 CS_VERBOSE + CS_WAIT_COMPLETE);
 627                         if (rv < SS_SUCCESS)
 628                                 goto out;
 629                 }
 630                 break;
 631         }
 632
             /* also covers running out of tries with a failure as last result */
 633         if (rv < SS_SUCCESS)
 634                 goto out;
 635
 636         if (forced)
 637                 dev_warn(DEV, "Forced to consider local data as UpToDate!\n");
 638
 639         /* Wait until nothing is on the fly :) */
 640         wait_event(device->misc_wait, atomic_read(&device->ap_pending_cnt) == 0);
 641
 642         /* FIXME also wait for all pending P_BARRIER_ACK? */
 643
 644         if (new_role == R_SECONDARY) {
 645                 set_disk_ro(device->vdisk, true);
 646                 if (get_ldev(device)) {
                             /* clear the lowest bit of the current UUID; the
                              * R_PRIMARY branch below sets it */
 647                         device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
 648                         put_ldev(device);
 649                 }
 650         } else {
 651                 mutex_lock(&first_peer_device(device)->connection->conf_update);
 652                 nc = first_peer_device(device)->connection->net_conf;
 653                 if (nc)
 654                         nc->discard_my_data = 0; /* without copy; single bit op is atomic */
 655                 mutex_unlock(&first_peer_device(device)->connection->conf_update);
 656
 657                 set_disk_ro(device->vdisk, false);
 658                 if (get_ldev(device)) {
                             /* new current UUID if we were forced, or if we are not
                              * connected (or the peer disk is D_FAILED or below)
                              * and the bitmap UUID is 0 */
 659                         if (((device->state.conn < C_CONNECTED ||
 660                                device->state.pdsk <= D_FAILED)
 661                               && device->ldev->md.uuid[UI_BITMAP] == 0) || forced)
 662                                 drbd_uuid_new_current(device);
 663
 664                         device->ldev->md.uuid[UI_CURRENT] |=  (u64)1;
 665                         put_ldev(device);
 666                 }
 667         }
 668
 669         /* writeout of activity log covered areas of the bitmap
 670          * to stable storage done in after state change already */
 671
 672         if (device->state.conn >= C_WF_REPORT_PARAMS) {
 673                 /* if this was forced, we should consider sync */
 674                 if (forced)
 675                         drbd_send_uuids(device);
 676                 drbd_send_current_state(device);
 677         }
 678
 679         drbd_md_sync(device);
 680
 681         kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE);
 682 out:
 683         mutex_unlock(device->state_mutex);
 684         return rv;
 685 }
 686
 687 static const char *from_attrs_err_to_txt(int err)
 688 {
 689         return  err == -ENOMSG ? "required attribute missing" :
 690                 err == -EOPNOTSUPP ? "unknown mandatory attribute" :
 691                 err == -EEXIST ? "can not change invariant setting" :
 692                 "invalid attribute value";
 693 }
 694
 695 int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info)
 696 {
 697         struct set_role_parms parms;
 698         int err;
 699         enum drbd_ret_code retcode;
 700
 701         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
 702         if (!adm_ctx.reply_skb)
 703                 return retcode;
 704         if (retcode != NO_ERROR)
 705                 goto out;
 706
 707         memset(&parms, 0, sizeof(parms));
 708         if (info->attrs[DRBD_NLA_SET_ROLE_PARMS]) {
 709                 err = set_role_parms_from_attrs(&parms, info);
 710                 if (err) {
 711                         retcode = ERR_MANDATORY_TAG;
 712                         drbd_msg_put_info(from_attrs_err_to_txt(err));
 713                         goto out;
 714                 }
 715         }
 716
 717         if (info->genlhdr->cmd == DRBD_ADM_PRIMARY)
 718                 retcode = drbd_set_role(adm_ctx.device, R_PRIMARY, parms.assume_uptodate);
 719         else
 720                 retcode = drbd_set_role(adm_ctx.device, R_SECONDARY, 0);
 721 out:
 722         drbd_adm_finish(info, retcode);
 723         return 0;
 724 }
 725
 726 /* Initializes the md.*_offset members, so we are able to find
 727  * the on disk meta data.
 728  *
 729  * We currently have two possible layouts:
 730  * external:
 731  *   |----------- md_size_sect ------------------|
 732  *   [ 4k superblock ][ activity log ][  Bitmap  ]
 733  *   | al_offset == 8 |
 734  *   | bm_offset = al_offset + X      |
 735  *  ==> bitmap sectors = md_size_sect - bm_offset
 736  *
 737  * internal:
 738  *            |----------- md_size_sect ------------------|
 739  * [data.....][  Bitmap  ][ activity log ][ 4k superblock ]
 740  *                        | al_offset < 0 |
 741  *            | bm_offset = al_offset - Y |
 742  *  ==> bitmap sectors = Y = al_offset - bm_offset
 743  *
 744  *  Activity log size used to be fixed 32kB,
 745  *  but is about to become configurable.
 746  */
 747 static void drbd_md_set_sector_offsets(struct drbd_device *device,
 748                                        struct drbd_backing_dev *bdev)
 749 {
 750         sector_t md_size_sect = 0;
 751         unsigned int al_size_sect = bdev->md.al_size_4k * 8;
 752
 753         bdev->md.md_offset = drbd_md_ss(bdev);
 754
 755         switch (bdev->md.meta_dev_idx) {
 756         default:
 757                 /* v07 style fixed size indexed meta data */
 758                 bdev->md.md_size_sect = MD_128MB_SECT;
 759                 bdev->md.al_offset = MD_4kB_SECT;
 760                 bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
 761                 break;
 762         case DRBD_MD_INDEX_FLEX_EXT:
 763                 /* just occupy the full device; unit: sectors */
 764                 bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev);
 765                 bdev->md.al_offset = MD_4kB_SECT;
 766                 bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
 767                 break;
 768         case DRBD_MD_INDEX_INTERNAL:
 769         case DRBD_MD_INDEX_FLEX_INT:
 770                 /* al size is still fixed */
 771                 bdev->md.al_offset = -al_size_sect;
 772                 /* we need (slightly less than) ~ this much bitmap sectors: */
 773                 md_size_sect = drbd_get_capacity(bdev->backing_bdev);
 774                 md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT);
 775                 md_size_sect = BM_SECT_TO_EXT(md_size_sect);
 776                 md_size_sect = ALIGN(md_size_sect, 8);
 777
 778                 /* plus the "drbd meta data super block",
 779                  * and the activity log; */
 780                 md_size_sect += MD_4kB_SECT + al_size_sect;
 781
 782                 bdev->md.md_size_sect = md_size_sect;
 783                 /* bitmap offset is adjusted by 'super' block size */
 784                 bdev->md.bm_offset   = -md_size_sect + MD_4kB_SECT;
 785                 break;
 786         }
 787 }
 788
 789 /* input size is expected to be in KB */
 790 char *ppsize(char *buf, unsigned long long size)
 791 {
 792         /* Needs 9 bytes at max including trailing NUL:
 793          * -1ULL ==> "16384 EB" */
 794         static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' };
 795         int base = 0;
 796         while (size >= 10000 && base < sizeof(units)-1) {
 797                 /* shift + round */
 798                 size = (size >> 10) + !!(size & (1<<9));
 799                 base++;
 800         }
 801         sprintf(buf, "%u %cB", (unsigned)size, units[base]);
 802
 803         return buf;
 804 }
 805
 806 /* there is still a theoretical deadlock when called from receiver
 * on a D_INCONSISTENT R_PRIMARY:
 808  *  remote READ does inc_ap_bio, receiver would need to receive answer
 809  *  packet from remote to dec_ap_bio again.
 810  *  receiver receive_sizes(), comes here,
 811  *  waits for ap_bio_cnt == 0. -> deadlock.
 812  * but this cannot happen, actually, because:
 813  *  R_PRIMARY D_INCONSISTENT, and peer's disk is unreachable
 814  *  (not connected, or bad/no disk on peer):
 815  *  see drbd_fail_request_early, ap_bio_cnt is zero.
 816  *  R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET:
 817  *  peer may not initiate a resize.
 818  */
 819 /* Note these are not to be confused with
 820  * drbd_adm_suspend_io/drbd_adm_resume_io,
 821  * which are (sub) state changes triggered by admin (drbdsetup),
 822  * and can be long lived.
 823  * This changes an device->flag, is triggered by drbd internals,
 824  * and should be short-lived. */
 825 void drbd_suspend_io(struct drbd_device *device)
 826 {
 827         set_bit(SUSPEND_IO, &device->flags);
 828         if (drbd_suspended(device))
 829                 return;
 830         wait_event(device->misc_wait, !atomic_read(&device->ap_bio_cnt));
 831 }
 832
/* Counterpart of drbd_suspend_io(): clears the SUSPEND_IO bit in
 * device->flags and wakes up waiters on misc_wait. */
void drbd_resume_io(struct drbd_device *device)
{
        clear_bit(SUSPEND_IO, &device->flags);
        wake_up(&device->misc_wait);
}
 838
 839 /**
 840  * drbd_determine_dev_size() -  Sets the right device size obeying all constraints
 841  * @device:     DRBD device.
 842  *
 843  * Returns 0 on success, negative return values indicate errors.
 844  * You should call drbd_md_sync() after calling this function.
 845  */
 846 enum determine_dev_size
 847 drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct resize_parms *rs) __must_hold(local)
 848 {
 849         sector_t prev_first_sect, prev_size; /* previous meta location */
 850         sector_t la_size_sect, u_size;
 851         struct drbd_md *md = &device->ldev->md;
 852         u32 prev_al_stripe_size_4k;
 853         u32 prev_al_stripes;
 854         sector_t size;
 855         char ppb[10];
 856         void *buffer;
 857
 858         int md_moved, la_size_changed;
 859         enum determine_dev_size rv = DS_UNCHANGED;
 860
 861         /* race:
 862          * application request passes inc_ap_bio,
 863          * but then cannot get an AL-reference.
 864          * this function later may wait on ap_bio_cnt == 0. -> deadlock.
 865          *
 866          * to avoid that:
 867          * Suspend IO right here.
 868          * still lock the act_log to not trigger ASSERTs there.
 869          */
 870         drbd_suspend_io(device);
 871         buffer = drbd_md_get_buffer(device); /* Lock meta-data IO */
 872         if (!buffer) {
 873                 drbd_resume_io(device);
 874                 return DS_ERROR;
 875         }
 876
 877         /* no wait necessary anymore, actually we could assert that */
 878         wait_event(device->al_wait, lc_try_lock(device->act_log));
 879
 880         prev_first_sect = drbd_md_first_sector(device->ldev);
 881         prev_size = device->ldev->md.md_size_sect;
 882         la_size_sect = device->ldev->md.la_size_sect;
 883
 884         if (rs) {
 885                 /* rs is non NULL if we should change the AL layout only */
 886
 887                 prev_al_stripes = md->al_stripes;
 888                 prev_al_stripe_size_4k = md->al_stripe_size_4k;
 889
 890                 md->al_stripes = rs->al_stripes;
 891                 md->al_stripe_size_4k = rs->al_stripe_size / 4;
 892                 md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4;
 893         }
 894
 895         drbd_md_set_sector_offsets(device, device->ldev);
 896
 897         rcu_read_lock();
 898         u_size = rcu_dereference(device->ldev->disk_conf)->disk_size;
 899         rcu_read_unlock();
 900         size = drbd_new_dev_size(device, device->ldev, u_size, flags & DDSF_FORCED);
 901
 902         if (size < la_size_sect) {
 903                 if (rs && u_size == 0) {
 904                         /* Remove "rs &&" later. This check should always be active, but
 905                            right now the receiver expects the permissive behavior */
 906                         dev_warn(DEV, "Implicit shrink not allowed. "
 907                                  "Use --size=%llus for explicit shrink.\n",
 908                                  (unsigned long long)size);
 909                         rv = DS_ERROR_SHRINK;
 910                 }
 911                 if (u_size > size)
 912                         rv = DS_ERROR_SPACE_MD;
 913                 if (rv != DS_UNCHANGED)
 914                         goto err_out;
 915         }
 916
 917         if (drbd_get_capacity(device->this_bdev) != size ||
 918             drbd_bm_capacity(device) != size) {
 919                 int err;
 920                 err = drbd_bm_resize(device, size, !(flags & DDSF_NO_RESYNC));
 921                 if (unlikely(err)) {
 922                         /* currently there is only one error: ENOMEM! */
 923                         size = drbd_bm_capacity(device)>>1;
 924                         if (size == 0) {
 925                                 dev_err(DEV, "OUT OF MEMORY! "
 926                                     "Could not allocate bitmap!\n");
 927                         } else {
 928                                 dev_err(DEV, "BM resizing failed. "
 929                                     "Leaving size unchanged at size = %lu KB\n",
 930                                     (unsigned long)size);
 931                         }
 932                         rv = DS_ERROR;
 933                 }
 934                 /* racy, see comments above. */
 935                 drbd_set_my_capacity(device, size);
 936                 device->ldev->md.la_size_sect = size;
 937                 dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1),
 938                      (unsigned long long)size>>1);
 939         }
 940         if (rv <= DS_ERROR)
 941                 goto err_out;
 942
 943         la_size_changed = (la_size_sect != device->ldev->md.la_size_sect);
 944
 945         md_moved = prev_first_sect != drbd_md_first_sector(device->ldev)
 946                 || prev_size       != device->ldev->md.md_size_sect;
 947
 948         if (la_size_changed || md_moved || rs) {
 949                 u32 prev_flags;
 950
 951                 drbd_al_shrink(device); /* All extents inactive. */
 952
 953                 prev_flags = md->flags;
 954                 md->flags &= ~MDF_PRIMARY_IND;
 955                 drbd_md_write(device, buffer);
 956
 957                 dev_info(DEV, "Writing the whole bitmap, %s\n",
 958                          la_size_changed && md_moved ? "size changed and md moved" :
 959                          la_size_changed ? "size changed" : "md moved");
 960                 /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
 961                 drbd_bitmap_io(device, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
 962                                "size changed", BM_LOCKED_MASK);
 963                 drbd_initialize_al(device, buffer);
 964
 965                 md->flags = prev_flags;
 966                 drbd_md_write(device, buffer);
 967
 968                 if (rs)
 969                         dev_info(DEV, "Changed AL layout to al-stripes = %d, al-stripe-size-kB = %d\n",
 970                                  md->al_stripes, md->al_stripe_size_4k * 4);
 971         }
 972
 973         if (size > la_size_sect)
 974                 rv = la_size_sect ? DS_GREW : DS_GREW_FROM_ZERO;
 975         if (size < la_size_sect)
 976                 rv = DS_SHRUNK;
 977
 978         if (0) {
 979         err_out:
 980                 if (rs) {
 981                         md->al_stripes = prev_al_stripes;
 982                         md->al_stripe_size_4k = prev_al_stripe_size_4k;
 983                         md->al_size_4k = (u64)prev_al_stripes * prev_al_stripe_size_4k;
 984
 985                         drbd_md_set_sector_offsets(device, device->ldev);
 986                 }
 987         }
 988         lc_unlock(device->act_log);
 989         wake_up(&device->al_wait);
 990         drbd_md_put_buffer(device);
 991         drbd_resume_io(device);
 992
 993         return rv;
 994 }
 995
 996 sector_t
 997 drbd_new_dev_size(struct drbd_device *device, struct drbd_backing_dev *bdev,
 998                   sector_t u_size, int assume_peer_has_space)
 999 {
1000         sector_t p_size = device->p_size;   /* partner's disk size. */
1001         sector_t la_size_sect = bdev->md.la_size_sect; /* last agreed size. */
1002         sector_t m_size; /* my size */
1003         sector_t size = 0;
1004
1005         m_size = drbd_get_max_capacity(bdev);
1006
1007         if (device->state.conn < C_CONNECTED && assume_peer_has_space) {
1008                 dev_warn(DEV, "Resize while not connected was forced by the user!\n");
1009                 p_size = m_size;
1010         }
1011
1012         if (p_size && m_size) {
1013                 size = min_t(sector_t, p_size, m_size);
1014         } else {
1015                 if (la_size_sect) {
1016                         size = la_size_sect;
1017                         if (m_size && m_size < size)
1018                                 size = m_size;
1019                         if (p_size && p_size < size)
1020                                 size = p_size;
1021                 } else {
1022                         if (m_size)
1023                                 size = m_size;
1024                         if (p_size)
1025                                 size = p_size;
1026                 }
1027         }
1028
1029         if (size == 0)
1030                 dev_err(DEV, "Both nodes diskless!\n");
1031
1032         if (u_size) {
1033                 if (u_size > size)
1034                         dev_err(DEV, "Requested disk size is too big (%lu > %lu)\n",
1035                             (unsigned long)u_size>>1, (unsigned long)size>>1);
1036                 else
1037                         size = u_size;
1038         }
1039
1040         return size;
1041 }
1042
1043 /**
1044  * drbd_check_al_size() - Ensures that the AL is of the right size
1045  * @device:     DRBD device.
1046  *
1047  * Returns -EBUSY if current al lru is still used, -ENOMEM when allocation
1048  * failed, and 0 on success. You should call drbd_md_sync() after you called
1049  * this function.
1050  */
1051 static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc)
1052 {
1053         struct lru_cache *n, *t;
1054         struct lc_element *e;
1055         unsigned int in_use;
1056         int i;
1057
1058         if (device->act_log &&
1059             device->act_log->nr_elements == dc->al_extents)
1060                 return 0;
1061
1062         in_use = 0;
1063         t = device->act_log;
1064         n = lc_create("act_log", drbd_al_ext_cache, AL_UPDATES_PER_TRANSACTION,
1065                 dc->al_extents, sizeof(struct lc_element), 0);
1066
1067         if (n == NULL) {
1068                 dev_err(DEV, "Cannot allocate act_log lru!\n");
1069                 return -ENOMEM;
1070         }
1071         spin_lock_irq(&device->al_lock);
1072         if (t) {
1073                 for (i = 0; i < t->nr_elements; i++) {
1074                         e = lc_element_by_index(t, i);
1075                         if (e->refcnt)
1076                                 dev_err(DEV, "refcnt(%d)==%d\n",
1077                                     e->lc_number, e->refcnt);
1078                         in_use += e->refcnt;
1079                 }
1080         }
1081         if (!in_use)
1082                 device->act_log = n;
1083         spin_unlock_irq(&device->al_lock);
1084         if (in_use) {
1085                 dev_err(DEV, "Activity log still in use!\n");
1086                 lc_destroy(n);
1087                 return -EBUSY;
1088         } else {
1089                 if (t)
1090                         lc_destroy(t);
1091         }
1092         drbd_md_mark_dirty(device); /* we changed device->act_log->nr_elemens */
1093         return 0;
1094 }
1095
1096 static void drbd_setup_queue_param(struct drbd_device *device, unsigned int max_bio_size)
1097 {
1098         struct request_queue * const q = device->rq_queue;
1099         unsigned int max_hw_sectors = max_bio_size >> 9;
1100         unsigned int max_segments = 0;
1101
1102         if (get_ldev_if_state(device, D_ATTACHING)) {
1103                 struct request_queue * const b = device->ldev->backing_bdev->bd_disk->queue;
1104
1105                 max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
1106                 rcu_read_lock();
1107                 max_segments = rcu_dereference(device->ldev->disk_conf)->max_bio_bvecs;
1108                 rcu_read_unlock();
1109                 put_ldev(device);
1110         }
1111
1112         blk_queue_logical_block_size(q, 512);
1113         blk_queue_max_hw_sectors(q, max_hw_sectors);
1114         /* This is the workaround for "bio would need to, but cannot, be split" */
1115         blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
1116         blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1);
1117
1118         if (get_ldev_if_state(device, D_ATTACHING)) {
1119                 struct request_queue * const b = device->ldev->backing_bdev->bd_disk->queue;
1120
1121                 blk_queue_stack_limits(q, b);
1122
1123                 if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
1124                         dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n",
1125                                  q->backing_dev_info.ra_pages,
1126                                  b->backing_dev_info.ra_pages);
1127                         q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
1128                 }
1129                 put_ldev(device);
1130         }
1131 }
1132
1133 void drbd_reconsider_max_bio_size(struct drbd_device *device)
1134 {
1135         unsigned int now, new, local, peer;
1136
1137         now = queue_max_hw_sectors(device->rq_queue) << 9;
1138         local = device->local_max_bio_size; /* Eventually last known value, from volatile memory */
1139         peer = device->peer_max_bio_size; /* Eventually last known value, from meta data */
1140
1141         if (get_ldev_if_state(device, D_ATTACHING)) {
1142                 local = queue_max_hw_sectors(device->ldev->backing_bdev->bd_disk->queue) << 9;
1143                 device->local_max_bio_size = local;
1144                 put_ldev(device);
1145         }
1146         local = min(local, DRBD_MAX_BIO_SIZE);
1147
1148         /* We may ignore peer limits if the peer is modern enough.
1149            Because new from 8.3.8 onwards the peer can use multiple
1150            BIOs for a single peer_request */
1151         if (device->state.conn >= C_WF_REPORT_PARAMS) {
1152                 if (first_peer_device(device)->connection->agreed_pro_version < 94)
1153                         peer = min(device->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
1154                         /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */
1155                 else if (first_peer_device(device)->connection->agreed_pro_version == 94)
1156                         peer = DRBD_MAX_SIZE_H80_PACKET;
1157                 else if (first_peer_device(device)->connection->agreed_pro_version < 100)
1158                         peer = DRBD_MAX_BIO_SIZE_P95;  /* drbd 8.3.8 onwards, before 8.4.0 */
1159                 else
1160                         peer = DRBD_MAX_BIO_SIZE;
1161         }
1162
1163         new = min(local, peer);
1164
1165         if (device->state.role == R_PRIMARY && new < now)
1166                 dev_err(DEV, "ASSERT FAILED new < now; (%u < %u)\n", new, now);
1167
1168         if (new != now)
1169                 dev_info(DEV, "max BIO size = %u\n", new);
1170
1171         drbd_setup_queue_param(device, new);
1172 }
1173
/* Starts the worker thread of @connection and then flushes the
 * connection's work queue.  Called before a (re)configuration;
 * conn_reconfig_done() is the counterpart that stops the threads
 * again if the connection ended up unconfigured. */
static void conn_reconfig_start(struct drbd_connection *connection)
{
        drbd_thread_start(&connection->worker);
        conn_flush_workqueue(connection);
}
1180
1181 /* if still unconfigured, stops worker again. */
1182 static void conn_reconfig_done(struct drbd_connection *connection)
1183 {
1184         bool stop_threads;
1185         spin_lock_irq(&connection->req_lock);
1186         stop_threads = conn_all_vols_unconf(connection) &&
1187                 connection->cstate == C_STANDALONE;
1188         spin_unlock_irq(&connection->req_lock);
1189         if (stop_threads) {
1190                 /* asender is implicitly stopped by receiver
1191                  * in conn_disconnect() */
1192                 drbd_thread_stop(&connection->receiver);
1193                 drbd_thread_stop(&connection->worker);
1194         }
1195 }
1196
1197 /* Make sure IO is suspended before calling this function(). */
1198 static void drbd_suspend_al(struct drbd_device *device)
1199 {
1200         int s = 0;
1201
1202         if (!lc_try_lock(device->act_log)) {
1203                 dev_warn(DEV, "Failed to lock al in drbd_suspend_al()\n");
1204                 return;
1205         }
1206
1207         drbd_al_shrink(device);
1208         spin_lock_irq(&first_peer_device(device)->connection->req_lock);
1209         if (device->state.conn < C_CONNECTED)
1210                 s = !test_and_set_bit(AL_SUSPENDED, &device->flags);
1211         spin_unlock_irq(&first_peer_device(device)->connection->req_lock);
1212         lc_unlock(device->act_log);
1213
1214         if (s)
1215                 dev_info(DEV, "Suspended AL updates\n");
1216 }
1217
1218
1219 static bool should_set_defaults(struct genl_info *info)
1220 {
1221         unsigned flags = ((struct drbd_genlmsghdr*)info->userhdr)->flags;
1222         return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS);
1223 }
1224
1225 static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev)
1226 {
1227         /* This is limited by 16 bit "slot" numbers,
1228          * and by available on-disk context storage.
1229          *
1230          * Also (u16)~0 is special (denotes a "free" extent).
1231          *
1232          * One transaction occupies one 4kB on-disk block,
1233          * we have n such blocks in the on disk ring buffer,
1234          * the "current" transaction may fail (n-1),
1235          * and there is 919 slot numbers context information per transaction.
1236          *
1237          * 72 transaction blocks amounts to more than 2**16 context slots,
1238          * so cap there first.
1239          */
1240         const unsigned int max_al_nr = DRBD_AL_EXTENTS_MAX;
1241         const unsigned int sufficient_on_disk =
1242                 (max_al_nr + AL_CONTEXT_PER_TRANSACTION -1)
1243                 /AL_CONTEXT_PER_TRANSACTION;
1244
1245         unsigned int al_size_4k = bdev->md.al_size_4k;
1246
1247         if (al_size_4k > sufficient_on_disk)
1248                 return max_al_nr;
1249
1250         return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION;
1251 }
1252
1253 int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
1254 {
1255         enum drbd_ret_code retcode;
1256         struct drbd_device *device;
1257         struct disk_conf *new_disk_conf, *old_disk_conf;
1258         struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
1259         int err, fifo_size;
1260
1261         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
1262         if (!adm_ctx.reply_skb)
1263                 return retcode;
1264         if (retcode != NO_ERROR)
1265                 goto out;
1266
1267         device = adm_ctx.device;
1268
1269         /* we also need a disk
1270          * to change the options on */
1271         if (!get_ldev(device)) {
1272                 retcode = ERR_NO_DISK;
1273                 goto out;
1274         }
1275
1276         new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL);
1277         if (!new_disk_conf) {
1278                 retcode = ERR_NOMEM;
1279                 goto fail;
1280         }
1281
1282         mutex_lock(&first_peer_device(device)->connection->conf_update);
1283         old_disk_conf = device->ldev->disk_conf;
1284         *new_disk_conf = *old_disk_conf;
1285         if (should_set_defaults(info))
1286                 set_disk_conf_defaults(new_disk_conf);
1287
1288         err = disk_conf_from_attrs_for_change(new_disk_conf, info);
1289         if (err && err != -ENOMSG) {
1290                 retcode = ERR_MANDATORY_TAG;
1291                 drbd_msg_put_info(from_attrs_err_to_txt(err));
1292                 goto fail_unlock;
1293         }
1294
1295         if (!expect(new_disk_conf->resync_rate >= 1))
1296                 new_disk_conf->resync_rate = 1;
1297
1298         if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
1299                 new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
1300         if (new_disk_conf->al_extents > drbd_al_extents_max(device->ldev))
1301                 new_disk_conf->al_extents = drbd_al_extents_max(device->ldev);
1302
1303         if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
1304                 new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
1305
1306         fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
1307         if (fifo_size != device->rs_plan_s->size) {
1308                 new_plan = fifo_alloc(fifo_size);
1309                 if (!new_plan) {
1310                         dev_err(DEV, "kmalloc of fifo_buffer failed");
1311                         retcode = ERR_NOMEM;
1312                         goto fail_unlock;
1313                 }
1314         }
1315
1316         drbd_suspend_io(device);
1317         wait_event(device->al_wait, lc_try_lock(device->act_log));
1318         drbd_al_shrink(device);
1319         err = drbd_check_al_size(device, new_disk_conf);
1320         lc_unlock(device->act_log);
1321         wake_up(&device->al_wait);
1322         drbd_resume_io(device);
1323
1324         if (err) {
1325                 retcode = ERR_NOMEM;
1326                 goto fail_unlock;
1327         }
1328
1329         write_lock_irq(&global_state_lock);
1330         retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
1331         if (retcode == NO_ERROR) {
1332                 rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
1333                 drbd_resync_after_changed(device);
1334         }
1335         write_unlock_irq(&global_state_lock);
1336
1337         if (retcode != NO_ERROR)
1338                 goto fail_unlock;
1339
1340         if (new_plan) {
1341                 old_plan = device->rs_plan_s;
1342                 rcu_assign_pointer(device->rs_plan_s, new_plan);
1343         }
1344
1345         mutex_unlock(&first_peer_device(device)->connection->conf_update);
1346
1347         if (new_disk_conf->al_updates)
1348                 device->ldev->md.flags &= ~MDF_AL_DISABLED;
1349         else
1350                 device->ldev->md.flags |= MDF_AL_DISABLED;
1351
1352         if (new_disk_conf->md_flushes)
1353                 clear_bit(MD_NO_FUA, &device->flags);
1354         else
1355                 set_bit(MD_NO_FUA, &device->flags);
1356
1357         drbd_bump_write_ordering(first_peer_device(device)->connection, WO_bdev_flush);
1358
1359         drbd_md_sync(device);
1360
1361         if (device->state.conn >= C_CONNECTED)
1362                 drbd_send_sync_param(device);
1363
1364         synchronize_rcu();
1365         kfree(old_disk_conf);
1366         kfree(old_plan);
1367         mod_timer(&device->request_timer, jiffies + HZ);
1368         goto success;
1369
1370 fail_unlock:
1371         mutex_unlock(&first_peer_device(device)->connection->conf_update);
1372  fail:
1373         kfree(new_disk_conf);
1374         kfree(new_plan);
1375 success:
1376         put_ldev(device);
1377  out:
1378         drbd_adm_finish(info, retcode);
1379         return 0;
1380 }
1381
1382 int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1383 {
1384         struct drbd_device *device;
1385         int err;
1386         enum drbd_ret_code retcode;
1387         enum determine_dev_size dd;
1388         sector_t max_possible_sectors;
1389         sector_t min_md_device_sectors;
1390         struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
1391         struct disk_conf *new_disk_conf = NULL;
1392         struct block_device *bdev;
1393         struct lru_cache *resync_lru = NULL;
1394         struct fifo_buffer *new_plan = NULL;
1395         union drbd_state ns, os;
1396         enum drbd_state_rv rv;
1397         struct net_conf *nc;
1398
1399         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
1400         if (!adm_ctx.reply_skb)
1401                 return retcode;
1402         if (retcode != NO_ERROR)
1403                 goto finish;
1404
1405         device = adm_ctx.device;
1406         conn_reconfig_start(first_peer_device(device)->connection);
1407
1408         /* if you want to reconfigure, please tear down first */
1409         if (device->state.disk > D_DISKLESS) {
1410                 retcode = ERR_DISK_CONFIGURED;
1411                 goto fail;
1412         }
1413         /* It may just now have detached because of IO error.  Make sure
1414          * drbd_ldev_destroy is done already, we may end up here very fast,
1415          * e.g. if someone calls attach from the on-io-error handler,
1416          * to realize a "hot spare" feature (not that I'd recommend that) */
1417         wait_event(device->misc_wait, !atomic_read(&device->local_cnt));
1418
1419         /* make sure there is no leftover from previous force-detach attempts */
1420         clear_bit(FORCE_DETACH, &device->flags);
1421         clear_bit(WAS_IO_ERROR, &device->flags);
1422         clear_bit(WAS_READ_ERROR, &device->flags);
1423
1424         /* and no leftover from previously aborted resync or verify, either */
1425         device->rs_total = 0;
1426         device->rs_failed = 0;
1427         atomic_set(&device->rs_pending_cnt, 0);
1428
1429         /* allocation not in the IO path, drbdsetup context */
1430         nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
1431         if (!nbc) {
1432                 retcode = ERR_NOMEM;
1433                 goto fail;
1434         }
1435         spin_lock_init(&nbc->md.uuid_lock);
1436
1437         new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
1438         if (!new_disk_conf) {
1439                 retcode = ERR_NOMEM;
1440                 goto fail;
1441         }
1442         nbc->disk_conf = new_disk_conf;
1443
1444         set_disk_conf_defaults(new_disk_conf);
1445         err = disk_conf_from_attrs(new_disk_conf, info);
1446         if (err) {
1447                 retcode = ERR_MANDATORY_TAG;
1448                 drbd_msg_put_info(from_attrs_err_to_txt(err));
1449                 goto fail;
1450         }
1451
1452         if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
1453                 new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
1454
1455         new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ);
1456         if (!new_plan) {
1457                 retcode = ERR_NOMEM;
1458                 goto fail;
1459         }
1460
1461         if (new_disk_conf->meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) {
1462                 retcode = ERR_MD_IDX_INVALID;
1463                 goto fail;
1464         }
1465
1466         write_lock_irq(&global_state_lock);
1467         retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
1468         write_unlock_irq(&global_state_lock);
1469         if (retcode != NO_ERROR)
1470                 goto fail;
1471
1472         rcu_read_lock();
1473         nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
1474         if (nc) {
1475                 if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) {
1476                         rcu_read_unlock();
1477                         retcode = ERR_STONITH_AND_PROT_A;
1478                         goto fail;
1479                 }
1480         }
1481         rcu_read_unlock();
1482
1483         bdev = blkdev_get_by_path(new_disk_conf->backing_dev,
1484                                   FMODE_READ | FMODE_WRITE | FMODE_EXCL, device);
1485         if (IS_ERR(bdev)) {
1486                 dev_err(DEV, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev,
1487                         PTR_ERR(bdev));
1488                 retcode = ERR_OPEN_DISK;
1489                 goto fail;
1490         }
1491         nbc->backing_bdev = bdev;
1492
1493         /*
1494          * meta_dev_idx >= 0: external fixed size, possibly multiple
1495          * drbd sharing one meta device.  TODO in that case, paranoia
1496          * check that [md_bdev, meta_dev_idx] is not yet used by some
1497          * other drbd minor!  (if you use drbd.conf + drbdadm, that
1498          * should check it for you already; but if you don't, or
1499          * someone fooled it, we need to double check here)
1500          */
1501         bdev = blkdev_get_by_path(new_disk_conf->meta_dev,
1502                                   FMODE_READ | FMODE_WRITE | FMODE_EXCL,
1503                                   (new_disk_conf->meta_dev_idx < 0) ?
1504                                   (void *)device : (void *)drbd_m_holder);
1505         if (IS_ERR(bdev)) {
1506                 dev_err(DEV, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev,
1507                         PTR_ERR(bdev));
1508                 retcode = ERR_OPEN_MD_DISK;
1509                 goto fail;
1510         }
1511         nbc->md_bdev = bdev;
1512
1513         if ((nbc->backing_bdev == nbc->md_bdev) !=
1514             (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
1515              new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
1516                 retcode = ERR_MD_IDX_INVALID;
1517                 goto fail;
1518         }
1519
1520         resync_lru = lc_create("resync", drbd_bm_ext_cache,
1521                         1, 61, sizeof(struct bm_extent),
1522                         offsetof(struct bm_extent, lce));
1523         if (!resync_lru) {
1524                 retcode = ERR_NOMEM;
1525                 goto fail;
1526         }
1527
1528         /* Read our meta data super block early.
1529          * This also sets other on-disk offsets. */
1530         retcode = drbd_md_read(device, nbc);
1531         if (retcode != NO_ERROR)
1532                 goto fail;
1533
1534         if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
1535                 new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
1536         if (new_disk_conf->al_extents > drbd_al_extents_max(nbc))
1537                 new_disk_conf->al_extents = drbd_al_extents_max(nbc);
1538
1539         if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) {
1540                 dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
1541                         (unsigned long long) drbd_get_max_capacity(nbc),
1542                         (unsigned long long) new_disk_conf->disk_size);
1543                 retcode = ERR_DISK_TOO_SMALL;
1544                 goto fail;
1545         }
1546
1547         if (new_disk_conf->meta_dev_idx < 0) {
1548                 max_possible_sectors = DRBD_MAX_SECTORS_FLEX;
1549                 /* at least one MB, otherwise it does not make sense */
1550                 min_md_device_sectors = (2<<10);
1551         } else {
1552                 max_possible_sectors = DRBD_MAX_SECTORS;
1553                 min_md_device_sectors = MD_128MB_SECT * (new_disk_conf->meta_dev_idx + 1);
1554         }
1555
1556         if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
1557                 retcode = ERR_MD_DISK_TOO_SMALL;
1558                 dev_warn(DEV, "refusing attach: md-device too small, "
1559                      "at least %llu sectors needed for this meta-disk type\n",
1560                      (unsigned long long) min_md_device_sectors);
1561                 goto fail;
1562         }
1563
1564         /* Make sure the new disk is big enough
1565          * (we may currently be R_PRIMARY with no local disk...) */
1566         if (drbd_get_max_capacity(nbc) <
1567             drbd_get_capacity(device->this_bdev)) {
1568                 retcode = ERR_DISK_TOO_SMALL;
1569                 goto fail;
1570         }
1571
1572         nbc->known_size = drbd_get_capacity(nbc->backing_bdev);
1573
1574         if (nbc->known_size > max_possible_sectors) {
1575                 dev_warn(DEV, "==> truncating very big lower level device "
1576                         "to currently maximum possible %llu sectors <==\n",
1577                         (unsigned long long) max_possible_sectors);
1578                 if (new_disk_conf->meta_dev_idx >= 0)
1579                         dev_warn(DEV, "==>> using internal or flexible "
1580                                       "meta data may help <<==\n");
1581         }
1582
1583         drbd_suspend_io(device);
1584         /* also wait for the last barrier ack. */
1585         /* FIXME see also https://daiquiri.linbit/cgi-bin/bugzilla/show_bug.cgi?id=171
1586          * We need a way to either ignore barrier acks for barriers sent before a device
1587          * was attached, or a way to wait for all pending barrier acks to come in.
1588          * As barriers are counted per resource,
1589          * we'd need to suspend io on all devices of a resource.
1590          */
1591         wait_event(device->misc_wait, !atomic_read(&device->ap_pending_cnt) || drbd_suspended(device));
1592         /* and for any other previously queued work */
1593         drbd_flush_workqueue(device);
1594
1595         rv = _drbd_request_state(device, NS(disk, D_ATTACHING), CS_VERBOSE);
1596         retcode = rv;  /* FIXME: Type mismatch. */
1597         drbd_resume_io(device);
1598         if (rv < SS_SUCCESS)
1599                 goto fail;
1600
1601         if (!get_ldev_if_state(device, D_ATTACHING))
1602                 goto force_diskless;
1603
1604         if (!device->bitmap) {
1605                 if (drbd_bm_init(device)) {
1606                         retcode = ERR_NOMEM;
1607                         goto force_diskless_dec;
1608                 }
1609         }
1610
1611         if (device->state.conn < C_CONNECTED &&
1612             device->state.role == R_PRIMARY &&
1613             (device->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) {
1614                 dev_err(DEV, "Can only attach to data with current UUID=%016llX\n",
1615                     (unsigned long long)device->ed_uuid);
1616                 retcode = ERR_DATA_NOT_CURRENT;
1617                 goto force_diskless_dec;
1618         }
1619
1620         /* Since we are diskless, fix the activity log first... */
1621         if (drbd_check_al_size(device, new_disk_conf)) {
1622                 retcode = ERR_NOMEM;
1623                 goto force_diskless_dec;
1624         }
1625
1626         /* Prevent shrinking of consistent devices ! */
1627         if (drbd_md_test_flag(nbc, MDF_CONSISTENT) &&
1628             drbd_new_dev_size(device, nbc, nbc->disk_conf->disk_size, 0) < nbc->md.la_size_sect) {
1629                 dev_warn(DEV, "refusing to truncate a consistent device\n");
1630                 retcode = ERR_DISK_TOO_SMALL;
1631                 goto force_diskless_dec;
1632         }
1633
1634         /* Reset the "barriers don't work" bits here, then force meta data to
1635          * be written, to ensure we determine if barriers are supported. */
1636         if (new_disk_conf->md_flushes)
1637                 clear_bit(MD_NO_FUA, &device->flags);
1638         else
1639                 set_bit(MD_NO_FUA, &device->flags);
1640
1641         /* Point of no return reached.
1642          * Devices and memory are no longer released by error cleanup below.
1643          * now device takes over responsibility, and the state engine should
1644          * clean it up somewhere.  */
1645         D_ASSERT(device->ldev == NULL);
1646         device->ldev = nbc;
1647         device->resync = resync_lru;
1648         device->rs_plan_s = new_plan;
1649         nbc = NULL;
1650         resync_lru = NULL;
1651         new_disk_conf = NULL;
1652         new_plan = NULL;
1653
1654         drbd_bump_write_ordering(first_peer_device(device)->connection, WO_bdev_flush);
1655
1656         if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY))
1657                 set_bit(CRASHED_PRIMARY, &device->flags);
1658         else
1659                 clear_bit(CRASHED_PRIMARY, &device->flags);
1660
1661         if (drbd_md_test_flag(device->ldev, MDF_PRIMARY_IND) &&
1662             !(device->state.role == R_PRIMARY &&
1663               first_peer_device(device)->connection->susp_nod))
1664                 set_bit(CRASHED_PRIMARY, &device->flags);
1665
1666         device->send_cnt = 0;
1667         device->recv_cnt = 0;
1668         device->read_cnt = 0;
1669         device->writ_cnt = 0;
1670
1671         drbd_reconsider_max_bio_size(device);
1672
1673         /* If I am currently not R_PRIMARY,
1674          * but meta data primary indicator is set,
1675          * I just now recover from a hard crash,
1676          * and have been R_PRIMARY before that crash.
1677          *
1678          * Now, if I had no connection before that crash
1679          * (have been degraded R_PRIMARY), chances are that
1680          * I won't find my peer now either.
1681          *
1682          * In that case, and _only_ in that case,
1683          * we use the degr-wfc-timeout instead of the default,
1684          * so we can automatically recover from a crash of a
1685          * degraded but active "cluster" after a certain timeout.
1686          */
1687         clear_bit(USE_DEGR_WFC_T, &device->flags);
1688         if (device->state.role != R_PRIMARY &&
1689              drbd_md_test_flag(device->ldev, MDF_PRIMARY_IND) &&
1690             !drbd_md_test_flag(device->ldev, MDF_CONNECTED_IND))
1691                 set_bit(USE_DEGR_WFC_T, &device->flags);
1692
1693         dd = drbd_determine_dev_size(device, 0, NULL);
1694         if (dd <= DS_ERROR) {
1695                 retcode = ERR_NOMEM_BITMAP;
1696                 goto force_diskless_dec;
1697         } else if (dd == DS_GREW)
1698                 set_bit(RESYNC_AFTER_NEG, &device->flags);
1699
1700         if (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC) ||
1701             (test_bit(CRASHED_PRIMARY, &device->flags) &&
1702              drbd_md_test_flag(device->ldev, MDF_AL_DISABLED))) {
1703                 dev_info(DEV, "Assuming that all blocks are out of sync "
1704                      "(aka FullSync)\n");
1705                 if (drbd_bitmap_io(device, &drbd_bmio_set_n_write,
1706                         "set_n_write from attaching", BM_LOCKED_MASK)) {
1707                         retcode = ERR_IO_MD_DISK;
1708                         goto force_diskless_dec;
1709                 }
1710         } else {
1711                 if (drbd_bitmap_io(device, &drbd_bm_read,
1712                         "read from attaching", BM_LOCKED_MASK)) {
1713                         retcode = ERR_IO_MD_DISK;
1714                         goto force_diskless_dec;
1715                 }
1716         }
1717
1718         if (_drbd_bm_total_weight(device) == drbd_bm_bits(device))
1719                 drbd_suspend_al(device); /* IO is still suspended here... */
1720
1721         spin_lock_irq(&first_peer_device(device)->connection->req_lock);
1722         os = drbd_read_state(device);
1723         ns = os;
1724         /* If MDF_CONSISTENT is not set go into inconsistent state,
1725            otherwise investigate MDF_WasUpToDate...
1726            If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state,
1727            otherwise into D_CONSISTENT state.
1728         */
1729         if (drbd_md_test_flag(device->ldev, MDF_CONSISTENT)) {
1730                 if (drbd_md_test_flag(device->ldev, MDF_WAS_UP_TO_DATE))
1731                         ns.disk = D_CONSISTENT;
1732                 else
1733                         ns.disk = D_OUTDATED;
1734         } else {
1735                 ns.disk = D_INCONSISTENT;
1736         }
1737
1738         if (drbd_md_test_flag(device->ldev, MDF_PEER_OUT_DATED))
1739                 ns.pdsk = D_OUTDATED;
1740
1741         rcu_read_lock();
1742         if (ns.disk == D_CONSISTENT &&
1743             (ns.pdsk == D_OUTDATED || rcu_dereference(device->ldev->disk_conf)->fencing == FP_DONT_CARE))
1744                 ns.disk = D_UP_TO_DATE;
1745
1746         /* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND,
1747            MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before
1748            this point, because drbd_request_state() modifies these
1749            flags. */
1750
1751         if (rcu_dereference(device->ldev->disk_conf)->al_updates)
1752                 device->ldev->md.flags &= ~MDF_AL_DISABLED;
1753         else
1754                 device->ldev->md.flags |= MDF_AL_DISABLED;
1755
1756         rcu_read_unlock();
1757
1758         /* In case we are C_CONNECTED postpone any decision on the new disk
1759            state after the negotiation phase. */
1760         if (device->state.conn == C_CONNECTED) {
1761                 device->new_state_tmp.i = ns.i;
1762                 ns.i = os.i;
1763                 ns.disk = D_NEGOTIATING;
1764
1765                 /* We expect to receive up-to-date UUIDs soon.
1766                    To avoid a race in receive_state, free p_uuid while
1767                    holding req_lock. I.e. atomic with the state change */
1768                 kfree(device->p_uuid);
1769                 device->p_uuid = NULL;
1770         }
1771
1772         rv = _drbd_set_state(device, ns, CS_VERBOSE, NULL);
1773         spin_unlock_irq(&first_peer_device(device)->connection->req_lock);
1774
1775         if (rv < SS_SUCCESS)
1776                 goto force_diskless_dec;
1777
1778         mod_timer(&device->request_timer, jiffies + HZ);
1779
1780         if (device->state.role == R_PRIMARY)
1781                 device->ldev->md.uuid[UI_CURRENT] |=  (u64)1;
1782         else
1783                 device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
1784
1785         drbd_md_mark_dirty(device);
1786         drbd_md_sync(device);
1787
1788         kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE);
1789         put_ldev(device);
1790         conn_reconfig_done(first_peer_device(device)->connection);
1791         drbd_adm_finish(info, retcode);
1792         return 0;
1793
1794  force_diskless_dec:
1795         put_ldev(device);
1796  force_diskless:
1797         drbd_force_state(device, NS(disk, D_DISKLESS));
1798         drbd_md_sync(device);
1799  fail:
1800         conn_reconfig_done(first_peer_device(device)->connection);
1801         if (nbc) {
1802                 if (nbc->backing_bdev)
1803                         blkdev_put(nbc->backing_bdev,
1804                                    FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1805                 if (nbc->md_bdev)
1806                         blkdev_put(nbc->md_bdev,
1807                                    FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1808                 kfree(nbc);
1809         }
1810         kfree(new_disk_conf);
1811         lc_destroy(resync_lru);
1812         kfree(new_plan);
1813
1814  finish:
1815         drbd_adm_finish(info, retcode);
1816         return 0;
1817 }
1818
1819 static int adm_detach(struct drbd_device *device, int force)
1820 {
1821         enum drbd_state_rv retcode;
1822         int ret;
1823
1824         if (force) {
1825                 set_bit(FORCE_DETACH, &device->flags);
1826                 drbd_force_state(device, NS(disk, D_FAILED));
1827                 retcode = SS_SUCCESS;
1828                 goto out;
1829         }
1830
1831         drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */
1832         drbd_md_get_buffer(device); /* make sure there is no in-flight meta-data IO */
1833         retcode = drbd_request_state(device, NS(disk, D_FAILED));
1834         drbd_md_put_buffer(device);
1835         /* D_FAILED will transition to DISKLESS. */
1836         ret = wait_event_interruptible(device->misc_wait,
1837                         device->state.disk != D_FAILED);
1838         drbd_resume_io(device);
1839         if ((int)retcode == (int)SS_IS_DISKLESS)
1840                 retcode = SS_NOTHING_TO_DO;
1841         if (ret)
1842                 retcode = ERR_INTR;
1843 out:
1844         return retcode;
1845 }
1846
1847 /* Detaching the disk is a process in multiple stages.  First we need to lock
1848  * out application IO, in-flight IO, IO stuck in drbd_al_begin_io.
1849  * Then we transition to D_DISKLESS, and wait for put_ldev() to return all
1850  * internal references as well.
1851  * Only then we have finally detached. */
1852 int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info)
1853 {
1854         enum drbd_ret_code retcode;
1855         struct detach_parms parms = { };
1856         int err;
1857
1858         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
1859         if (!adm_ctx.reply_skb)
1860                 return retcode;
1861         if (retcode != NO_ERROR)
1862                 goto out;
1863
1864         if (info->attrs[DRBD_NLA_DETACH_PARMS]) {
1865                 err = detach_parms_from_attrs(&parms, info);
1866                 if (err) {
1867                         retcode = ERR_MANDATORY_TAG;
1868                         drbd_msg_put_info(from_attrs_err_to_txt(err));
1869                         goto out;
1870                 }
1871         }
1872
1873         retcode = adm_detach(adm_ctx.device, parms.force_detach);
1874 out:
1875         drbd_adm_finish(info, retcode);
1876         return 0;
1877 }
1878
1879 static bool conn_resync_running(struct drbd_connection *connection)
1880 {
1881         struct drbd_device *device;
1882         bool rv = false;
1883         int vnr;
1884
1885         rcu_read_lock();
1886         idr_for_each_entry(&connection->volumes, device, vnr) {
1887                 if (device->state.conn == C_SYNC_SOURCE ||
1888                     device->state.conn == C_SYNC_TARGET ||
1889                     device->state.conn == C_PAUSED_SYNC_S ||
1890                     device->state.conn == C_PAUSED_SYNC_T) {
1891                         rv = true;
1892                         break;
1893                 }
1894         }
1895         rcu_read_unlock();
1896
1897         return rv;
1898 }
1899
1900 static bool conn_ov_running(struct drbd_connection *connection)
1901 {
1902         struct drbd_device *device;
1903         bool rv = false;
1904         int vnr;
1905
1906         rcu_read_lock();
1907         idr_for_each_entry(&connection->volumes, device, vnr) {
1908                 if (device->state.conn == C_VERIFY_S ||
1909                     device->state.conn == C_VERIFY_T) {
1910                         rv = true;
1911                         break;
1912                 }
1913         }
1914         rcu_read_unlock();
1915
1916         return rv;
1917 }
1918
1919 static enum drbd_ret_code
1920 _check_net_options(struct drbd_connection *connection, struct net_conf *old_conf, struct net_conf *new_conf)
1921 {
1922         struct drbd_device *device;
1923         int i;
1924
1925         if (old_conf && connection->cstate == C_WF_REPORT_PARAMS && connection->agreed_pro_version < 100) {
1926                 if (new_conf->wire_protocol != old_conf->wire_protocol)
1927                         return ERR_NEED_APV_100;
1928
1929                 if (new_conf->two_primaries != old_conf->two_primaries)
1930                         return ERR_NEED_APV_100;
1931
1932                 if (strcmp(new_conf->integrity_alg, old_conf->integrity_alg))
1933                         return ERR_NEED_APV_100;
1934         }
1935
1936         if (!new_conf->two_primaries &&
1937             conn_highest_role(connection) == R_PRIMARY &&
1938             conn_highest_peer(connection) == R_PRIMARY)
1939                 return ERR_NEED_ALLOW_TWO_PRI;
1940
1941         if (new_conf->two_primaries &&
1942             (new_conf->wire_protocol != DRBD_PROT_C))
1943                 return ERR_NOT_PROTO_C;
1944
1945         idr_for_each_entry(&connection->volumes, device, i) {
1946                 if (get_ldev(device)) {
1947                         enum drbd_fencing_p fp = rcu_dereference(device->ldev->disk_conf)->fencing;
1948                         put_ldev(device);
1949                         if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH)
1950                                 return ERR_STONITH_AND_PROT_A;
1951                 }
1952                 if (device->state.role == R_PRIMARY && new_conf->discard_my_data)
1953                         return ERR_DISCARD_IMPOSSIBLE;
1954         }
1955
1956         if (new_conf->on_congestion != OC_BLOCK && new_conf->wire_protocol != DRBD_PROT_A)
1957                 return ERR_CONG_NOT_PROTO_A;
1958
1959         return NO_ERROR;
1960 }
1961
1962 static enum drbd_ret_code
1963 check_net_options(struct drbd_connection *connection, struct net_conf *new_conf)
1964 {
1965         static enum drbd_ret_code rv;
1966         struct drbd_device *device;
1967         int i;
1968
1969         rcu_read_lock();
1970         rv = _check_net_options(connection, rcu_dereference(connection->net_conf), new_conf);
1971         rcu_read_unlock();
1972
1973         /* connection->volumes protected by genl_lock() here */
1974         idr_for_each_entry(&connection->volumes, device, i) {
1975                 if (!device->bitmap) {
1976                         if (drbd_bm_init(device))
1977                                 return ERR_NOMEM;
1978                 }
1979         }
1980
1981         return rv;
1982 }
1983
/*
 * Hash transforms allocated for a new net_conf before they are handed
 * over to the connection.  A member stays NULL when its algorithm name is
 * empty or its allocation failed (see alloc_hash()).
 */
struct crypto {
        /* allocated from net_conf verify_alg and csums_alg */
        struct crypto_hash *verify_tfm, *csums_tfm;
        /* allocated from "hmac(<cram_hmac_alg>)" and integrity_alg */
        struct crypto_hash *cram_hmac_tfm, *integrity_tfm;
};
1990
1991 static int
1992 alloc_hash(struct crypto_hash **tfm, char *tfm_name, int err_alg)
1993 {
1994         if (!tfm_name[0])
1995                 return NO_ERROR;
1996
1997         *tfm = crypto_alloc_hash(tfm_name, 0, CRYPTO_ALG_ASYNC);
1998         if (IS_ERR(*tfm)) {
1999                 *tfm = NULL;
2000                 return err_alg;
2001         }
2002
2003         return NO_ERROR;
2004 }
2005
2006 static enum drbd_ret_code
2007 alloc_crypto(struct crypto *crypto, struct net_conf *new_conf)
2008 {
2009         char hmac_name[CRYPTO_MAX_ALG_NAME];
2010         enum drbd_ret_code rv;
2011
2012         rv = alloc_hash(&crypto->csums_tfm, new_conf->csums_alg,
2013                        ERR_CSUMS_ALG);
2014         if (rv != NO_ERROR)
2015                 return rv;
2016         rv = alloc_hash(&crypto->verify_tfm, new_conf->verify_alg,
2017                        ERR_VERIFY_ALG);
2018         if (rv != NO_ERROR)
2019                 return rv;
2020         rv = alloc_hash(&crypto->integrity_tfm, new_conf->integrity_alg,
2021                        ERR_INTEGRITY_ALG);
2022         if (rv != NO_ERROR)
2023                 return rv;
2024         if (new_conf->cram_hmac_alg[0] != 0) {
2025                 snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
2026                          new_conf->cram_hmac_alg);
2027
2028                 rv = alloc_hash(&crypto->cram_hmac_tfm, hmac_name,
2029                                ERR_AUTH_ALG);
2030         }
2031
2032         return rv;
2033 }
2034
2035 static void free_crypto(struct crypto *crypto)
2036 {
2037         crypto_free_hash(crypto->cram_hmac_tfm);
2038         crypto_free_hash(crypto->integrity_tfm);
2039         crypto_free_hash(crypto->csums_tfm);
2040         crypto_free_hash(crypto->verify_tfm);
2041 }
2042
2043 int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info)
2044 {
2045         enum drbd_ret_code retcode;
2046         struct drbd_connection *connection;
2047         struct net_conf *old_conf, *new_conf = NULL;
2048         int err;
2049         int ovr; /* online verify running */
2050         int rsr; /* re-sync running */
2051         struct crypto crypto = { };
2052
2053         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION);
2054         if (!adm_ctx.reply_skb)
2055                 return retcode;
2056         if (retcode != NO_ERROR)
2057                 goto out;
2058
2059         connection = adm_ctx.connection;
2060
2061         new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
2062         if (!new_conf) {
2063                 retcode = ERR_NOMEM;
2064                 goto out;
2065         }
2066
2067         conn_reconfig_start(connection);
2068
2069         mutex_lock(&connection->data.mutex);
2070         mutex_lock(&connection->conf_update);
2071         old_conf = connection->net_conf;
2072
2073         if (!old_conf) {
2074                 drbd_msg_put_info("net conf missing, try connect");
2075                 retcode = ERR_INVALID_REQUEST;
2076                 goto fail;
2077         }
2078
2079         *new_conf = *old_conf;
2080         if (should_set_defaults(info))
2081                 set_net_conf_defaults(new_conf);
2082
2083         err = net_conf_from_attrs_for_change(new_conf, info);
2084         if (err && err != -ENOMSG) {
2085                 retcode = ERR_MANDATORY_TAG;
2086                 drbd_msg_put_info(from_attrs_err_to_txt(err));
2087                 goto fail;
2088         }
2089
2090         retcode = check_net_options(connection, new_conf);
2091         if (retcode != NO_ERROR)
2092                 goto fail;
2093
2094         /* re-sync running */
2095         rsr = conn_resync_running(connection);
2096         if (rsr && strcmp(new_conf->csums_alg, old_conf->csums_alg)) {
2097                 retcode = ERR_CSUMS_RESYNC_RUNNING;
2098                 goto fail;
2099         }
2100
2101         /* online verify running */
2102         ovr = conn_ov_running(connection);
2103         if (ovr && strcmp(new_conf->verify_alg, old_conf->verify_alg)) {
2104                 retcode = ERR_VERIFY_RUNNING;
2105                 goto fail;
2106         }
2107
2108         retcode = alloc_crypto(&crypto, new_conf);
2109         if (retcode != NO_ERROR)
2110                 goto fail;
2111
2112         rcu_assign_pointer(connection->net_conf, new_conf);
2113
2114         if (!rsr) {
2115                 crypto_free_hash(connection->csums_tfm);
2116                 connection->csums_tfm = crypto.csums_tfm;
2117                 crypto.csums_tfm = NULL;
2118         }
2119         if (!ovr) {
2120                 crypto_free_hash(connection->verify_tfm);
2121                 connection->verify_tfm = crypto.verify_tfm;
2122                 crypto.verify_tfm = NULL;
2123         }
2124
2125         crypto_free_hash(connection->integrity_tfm);
2126         connection->integrity_tfm = crypto.integrity_tfm;
2127         if (connection->cstate >= C_WF_REPORT_PARAMS && connection->agreed_pro_version >= 100)
2128                 /* Do this without trying to take connection->data.mutex again.  */
2129                 __drbd_send_protocol(connection, P_PROTOCOL_UPDATE);
2130
2131         crypto_free_hash(connection->cram_hmac_tfm);
2132         connection->cram_hmac_tfm = crypto.cram_hmac_tfm;
2133
2134         mutex_unlock(&connection->conf_update);
2135         mutex_unlock(&connection->data.mutex);
2136         synchronize_rcu();
2137         kfree(old_conf);
2138
2139         if (connection->cstate >= C_WF_REPORT_PARAMS)
2140                 drbd_send_sync_param(minor_to_device(conn_lowest_minor(connection)));
2141
2142         goto done;
2143
2144  fail:
2145         mutex_unlock(&connection->conf_update);
2146         mutex_unlock(&connection->data.mutex);
2147         free_crypto(&crypto);
2148         kfree(new_conf);
2149  done:
2150         conn_reconfig_done(connection);
2151  out:
2152         drbd_adm_finish(info, retcode);
2153         return 0;
2154 }
2155
2156 int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
2157 {
2158         struct drbd_device *device;
2159         struct net_conf *old_conf, *new_conf = NULL;
2160         struct crypto crypto = { };
2161         struct drbd_resource *resource;
2162         struct drbd_connection *connection;
2163         enum drbd_ret_code retcode;
2164         int i;
2165         int err;
2166
2167         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
2168
2169         if (!adm_ctx.reply_skb)
2170                 return retcode;
2171         if (retcode != NO_ERROR)
2172                 goto out;
2173         if (!(adm_ctx.my_addr && adm_ctx.peer_addr)) {
2174                 drbd_msg_put_info("connection endpoint(s) missing");
2175                 retcode = ERR_INVALID_REQUEST;
2176                 goto out;
2177         }
2178
2179         /* No need for _rcu here. All reconfiguration is
2180          * strictly serialized on genl_lock(). We are protected against
2181          * concurrent reconfiguration/addition/deletion */
2182         for_each_resource(resource, &drbd_resources) {
2183                 for_each_connection(connection, resource) {
2184                         if (nla_len(adm_ctx.my_addr) == connection->my_addr_len &&
2185                             !memcmp(nla_data(adm_ctx.my_addr), &connection->my_addr,
2186                                     connection->my_addr_len)) {
2187                                 retcode = ERR_LOCAL_ADDR;
2188                                 goto out;
2189                         }
2190
2191                         if (nla_len(adm_ctx.peer_addr) == connection->peer_addr_len &&
2192                             !memcmp(nla_data(adm_ctx.peer_addr), &connection->peer_addr,
2193                                     connection->peer_addr_len)) {
2194                                 retcode = ERR_PEER_ADDR;
2195                                 goto out;
2196                         }
2197                 }
2198         }
2199
2200         connection = adm_ctx.connection;
2201         conn_reconfig_start(connection);
2202
2203         if (connection->cstate > C_STANDALONE) {
2204                 retcode = ERR_NET_CONFIGURED;
2205                 goto fail;
2206         }
2207
2208         /* allocation not in the IO path, drbdsetup / netlink process context */
2209         new_conf = kzalloc(sizeof(*new_conf), GFP_KERNEL);
2210         if (!new_conf) {
2211                 retcode = ERR_NOMEM;
2212                 goto fail;
2213         }
2214
2215         set_net_conf_defaults(new_conf);
2216
2217         err = net_conf_from_attrs(new_conf, info);
2218         if (err && err != -ENOMSG) {
2219                 retcode = ERR_MANDATORY_TAG;
2220                 drbd_msg_put_info(from_attrs_err_to_txt(err));
2221                 goto fail;
2222         }
2223
2224         retcode = check_net_options(connection, new_conf);
2225         if (retcode != NO_ERROR)
2226                 goto fail;
2227
2228         retcode = alloc_crypto(&crypto, new_conf);
2229         if (retcode != NO_ERROR)
2230                 goto fail;
2231
2232         ((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0;
2233
2234         conn_flush_workqueue(connection);
2235
2236         mutex_lock(&connection->conf_update);
2237         old_conf = connection->net_conf;
2238         if (old_conf) {
2239                 retcode = ERR_NET_CONFIGURED;
2240                 mutex_unlock(&connection->conf_update);
2241                 goto fail;
2242         }
2243         rcu_assign_pointer(connection->net_conf, new_conf);
2244
2245         conn_free_crypto(connection);
2246         connection->cram_hmac_tfm = crypto.cram_hmac_tfm;
2247         connection->integrity_tfm = crypto.integrity_tfm;
2248         connection->csums_tfm = crypto.csums_tfm;
2249         connection->verify_tfm = crypto.verify_tfm;
2250
2251         connection->my_addr_len = nla_len(adm_ctx.my_addr);
2252         memcpy(&connection->my_addr, nla_data(adm_ctx.my_addr), connection->my_addr_len);
2253         connection->peer_addr_len = nla_len(adm_ctx.peer_addr);
2254         memcpy(&connection->peer_addr, nla_data(adm_ctx.peer_addr), connection->peer_addr_len);
2255
2256         mutex_unlock(&connection->conf_update);
2257
2258         rcu_read_lock();
2259         idr_for_each_entry(&connection->volumes, device, i) {
2260                 device->send_cnt = 0;
2261                 device->recv_cnt = 0;
2262         }
2263         rcu_read_unlock();
2264
2265         retcode = conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE);
2266
2267         conn_reconfig_done(connection);
2268         drbd_adm_finish(info, retcode);
2269         return 0;
2270
2271 fail:
2272         free_crypto(&crypto);
2273         kfree(new_conf);
2274
2275         conn_reconfig_done(connection);
2276 out:
2277         drbd_adm_finish(info, retcode);
2278         return 0;
2279 }
2280
2281 static enum drbd_state_rv conn_try_disconnect(struct drbd_connection *connection, bool force)
2282 {
2283         enum drbd_state_rv rv;
2284
2285         rv = conn_request_state(connection, NS(conn, C_DISCONNECTING),
2286                         force ? CS_HARD : 0);
2287
2288         switch (rv) {
2289         case SS_NOTHING_TO_DO:
2290                 break;
2291         case SS_ALREADY_STANDALONE:
2292                 return SS_SUCCESS;
2293         case SS_PRIMARY_NOP:
2294                 /* Our state checking code wants to see the peer outdated. */
2295                 rv = conn_request_state(connection, NS2(conn, C_DISCONNECTING, pdsk, D_OUTDATED), 0);
2296
2297                 if (rv == SS_OUTDATE_WO_CONN) /* lost connection before graceful disconnect succeeded */
2298                         rv = conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_VERBOSE);
2299
2300                 break;
2301         case SS_CW_FAILED_BY_PEER:
2302                 /* The peer probably wants to see us outdated. */
2303                 rv = conn_request_state(connection, NS2(conn, C_DISCONNECTING,
2304                                                         disk, D_OUTDATED), 0);
2305                 if (rv == SS_IS_DISKLESS || rv == SS_LOWER_THAN_OUTDATED) {
2306                         rv = conn_request_state(connection, NS(conn, C_DISCONNECTING),
2307                                         CS_HARD);
2308                 }
2309                 break;
2310         default:;
2311                 /* no special handling necessary */
2312         }
2313
2314         if (rv >= SS_SUCCESS) {
2315                 enum drbd_state_rv rv2;
2316                 /* No one else can reconfigure the network while I am here.
2317                  * The state handling only uses drbd_thread_stop_nowait(),
2318                  * we want to really wait here until the receiver is no more.
2319                  */
2320                 drbd_thread_stop(&connection->receiver);
2321
2322                 /* Race breaker.  This additional state change request may be
2323                  * necessary, if this was a forced disconnect during a receiver
2324                  * restart.  We may have "killed" the receiver thread just
2325                  * after drbdd_init() returned.  Typically, we should be
2326                  * C_STANDALONE already, now, and this becomes a no-op.
2327                  */
2328                 rv2 = conn_request_state(connection, NS(conn, C_STANDALONE),
2329                                 CS_VERBOSE | CS_HARD);
2330                 if (rv2 < SS_SUCCESS)
2331                         conn_err(connection,
2332                                 "unexpected rv2=%d in conn_try_disconnect()\n",
2333                                 rv2);
2334         }
2335         return rv;
2336 }
2337
2338 int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info)
2339 {
2340         struct disconnect_parms parms;
2341         struct drbd_connection *connection;
2342         enum drbd_state_rv rv;
2343         enum drbd_ret_code retcode;
2344         int err;
2345
2346         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION);
2347         if (!adm_ctx.reply_skb)
2348                 return retcode;
2349         if (retcode != NO_ERROR)
2350                 goto fail;
2351
2352         connection = adm_ctx.connection;
2353         memset(&parms, 0, sizeof(parms));
2354         if (info->attrs[DRBD_NLA_DISCONNECT_PARMS]) {
2355                 err = disconnect_parms_from_attrs(&parms, info);
2356                 if (err) {
2357                         retcode = ERR_MANDATORY_TAG;
2358                         drbd_msg_put_info(from_attrs_err_to_txt(err));
2359                         goto fail;
2360                 }
2361         }
2362
2363         rv = conn_try_disconnect(connection, parms.force_disconnect);
2364         if (rv < SS_SUCCESS)
2365                 retcode = rv;  /* FIXME: Type mismatch. */
2366         else
2367                 retcode = NO_ERROR;
2368  fail:
2369         drbd_adm_finish(info, retcode);
2370         return 0;
2371 }
2372
2373 void resync_after_online_grow(struct drbd_device *device)
2374 {
2375         int iass; /* I am sync source */
2376
2377         dev_info(DEV, "Resync of new storage after online grow\n");
2378         if (device->state.role != device->state.peer)
2379                 iass = (device->state.role == R_PRIMARY);
2380         else
2381                 iass = test_bit(RESOLVE_CONFLICTS, &first_peer_device(device)->connection->flags);
2382
2383         if (iass)
2384                 drbd_start_resync(device, C_SYNC_SOURCE);
2385         else
2386                 _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE);
2387 }
2388
2389 int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
2390 {
2391         struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
2392         struct resize_parms rs;
2393         struct drbd_device *device;
2394         enum drbd_ret_code retcode;
2395         enum determine_dev_size dd;
2396         bool change_al_layout = false;
2397         enum dds_flags ddsf;
2398         sector_t u_size;
2399         int err;
2400
2401         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2402         if (!adm_ctx.reply_skb)
2403                 return retcode;
2404         if (retcode != NO_ERROR)
2405                 goto fail;
2406
2407         device = adm_ctx.device;
2408         if (!get_ldev(device)) {
2409                 retcode = ERR_NO_DISK;
2410                 goto fail;
2411         }
2412
2413         memset(&rs, 0, sizeof(struct resize_parms));
2414         rs.al_stripes = device->ldev->md.al_stripes;
2415         rs.al_stripe_size = device->ldev->md.al_stripe_size_4k * 4;
2416         if (info->attrs[DRBD_NLA_RESIZE_PARMS]) {
2417                 err = resize_parms_from_attrs(&rs, info);
2418                 if (err) {
2419                         retcode = ERR_MANDATORY_TAG;
2420                         drbd_msg_put_info(from_attrs_err_to_txt(err));
2421                         goto fail_ldev;
2422                 }
2423         }
2424
2425         if (device->state.conn > C_CONNECTED) {
2426                 retcode = ERR_RESIZE_RESYNC;
2427                 goto fail_ldev;
2428         }
2429
2430         if (device->state.role == R_SECONDARY &&
2431             device->state.peer == R_SECONDARY) {
2432                 retcode = ERR_NO_PRIMARY;
2433                 goto fail_ldev;
2434         }
2435
2436         if (rs.no_resync && first_peer_device(device)->connection->agreed_pro_version < 93) {
2437                 retcode = ERR_NEED_APV_93;
2438                 goto fail_ldev;
2439         }
2440
2441         rcu_read_lock();
2442         u_size = rcu_dereference(device->ldev->disk_conf)->disk_size;
2443         rcu_read_unlock();
2444         if (u_size != (sector_t)rs.resize_size) {
2445                 new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL);
2446                 if (!new_disk_conf) {
2447                         retcode = ERR_NOMEM;
2448                         goto fail_ldev;
2449                 }
2450         }
2451
2452         if (device->ldev->md.al_stripes != rs.al_stripes ||
2453             device->ldev->md.al_stripe_size_4k != rs.al_stripe_size / 4) {
2454                 u32 al_size_k = rs.al_stripes * rs.al_stripe_size;
2455
2456                 if (al_size_k > (16 * 1024 * 1024)) {
2457                         retcode = ERR_MD_LAYOUT_TOO_BIG;
2458                         goto fail_ldev;
2459                 }
2460
2461                 if (al_size_k < MD_32kB_SECT/2) {
2462                         retcode = ERR_MD_LAYOUT_TOO_SMALL;
2463                         goto fail_ldev;
2464                 }
2465
2466                 if (device->state.conn != C_CONNECTED) {
2467                         retcode = ERR_MD_LAYOUT_CONNECTED;
2468                         goto fail_ldev;
2469                 }
2470
2471                 change_al_layout = true;
2472         }
2473
2474         if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev))
2475                 device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev);
2476
2477         if (new_disk_conf) {
2478                 mutex_lock(&first_peer_device(device)->connection->conf_update);
2479                 old_disk_conf = device->ldev->disk_conf;
2480                 *new_disk_conf = *old_disk_conf;
2481                 new_disk_conf->disk_size = (sector_t)rs.resize_size;
2482                 rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
2483                 mutex_unlock(&first_peer_device(device)->connection->conf_update);
2484                 synchronize_rcu();
2485                 kfree(old_disk_conf);
2486         }
2487
2488         ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
2489         dd = drbd_determine_dev_size(device, ddsf, change_al_layout ? &rs : NULL);
2490         drbd_md_sync(device);
2491         put_ldev(device);
2492         if (dd == DS_ERROR) {
2493                 retcode = ERR_NOMEM_BITMAP;
2494                 goto fail;
2495         } else if (dd == DS_ERROR_SPACE_MD) {
2496                 retcode = ERR_MD_LAYOUT_NO_FIT;
2497                 goto fail;
2498         } else if (dd == DS_ERROR_SHRINK) {
2499                 retcode = ERR_IMPLICIT_SHRINK;
2500                 goto fail;
2501         }
2502
2503         if (device->state.conn == C_CONNECTED) {
2504                 if (dd == DS_GREW)
2505                         set_bit(RESIZE_PENDING, &device->flags);
2506
2507                 drbd_send_uuids(device);
2508                 drbd_send_sizes(device, 1, ddsf);
2509         }
2510
2511  fail:
2512         drbd_adm_finish(info, retcode);
2513         return 0;
2514
2515  fail_ldev:
2516         put_ldev(device);
2517         goto fail;
2518 }
2519
2520 int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info)
2521 {
2522         enum drbd_ret_code retcode;
2523         struct res_opts res_opts;
2524         int err;
2525
2526         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
2527         if (!adm_ctx.reply_skb)
2528                 return retcode;
2529         if (retcode != NO_ERROR)
2530                 goto fail;
2531
2532         res_opts = adm_ctx.resource->res_opts;
2533         if (should_set_defaults(info))
2534                 set_res_opts_defaults(&res_opts);
2535
2536         err = res_opts_from_attrs(&res_opts, info);
2537         if (err && err != -ENOMSG) {
2538                 retcode = ERR_MANDATORY_TAG;
2539                 drbd_msg_put_info(from_attrs_err_to_txt(err));
2540                 goto fail;
2541         }
2542
2543         err = set_resource_options(adm_ctx.resource, &res_opts);
2544         if (err) {
2545                 retcode = ERR_INVALID_REQUEST;
2546                 if (err == -ENOMEM)
2547                         retcode = ERR_NOMEM;
2548         }
2549
2550 fail:
2551         drbd_adm_finish(info, retcode);
2552         return 0;
2553 }
2554
2555 int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
2556 {
2557         struct drbd_device *device;
2558         int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
2559
2560         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2561         if (!adm_ctx.reply_skb)
2562                 return retcode;
2563         if (retcode != NO_ERROR)
2564                 goto out;
2565
2566         device = adm_ctx.device;
2567
2568         /* If there is still bitmap IO pending, probably because of a previous
2569          * resync just being finished, wait for it before requesting a new resync.
2570          * Also wait for it's after_state_ch(). */
2571         drbd_suspend_io(device);
2572         wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
2573         drbd_flush_workqueue(device);
2574
2575         /* If we happen to be C_STANDALONE R_SECONDARY, just change to
2576          * D_INCONSISTENT, and set all bits in the bitmap.  Otherwise,
2577          * try to start a resync handshake as sync target for full sync.
2578          */
2579         if (device->state.conn == C_STANDALONE && device->state.role == R_SECONDARY) {
2580                 retcode = drbd_request_state(device, NS(disk, D_INCONSISTENT));
2581                 if (retcode >= SS_SUCCESS) {
2582                         if (drbd_bitmap_io(device, &drbd_bmio_set_n_write,
2583                                 "set_n_write from invalidate", BM_LOCKED_MASK))
2584                                 retcode = ERR_IO_MD_DISK;
2585                 }
2586         } else
2587                 retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_T));
2588         drbd_resume_io(device);
2589
2590 out:
2591         drbd_adm_finish(info, retcode);
2592         return 0;
2593 }
2594
2595 static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *info,
2596                 union drbd_state mask, union drbd_state val)
2597 {
2598         enum drbd_ret_code retcode;
2599
2600         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2601         if (!adm_ctx.reply_skb)
2602                 return retcode;
2603         if (retcode != NO_ERROR)
2604                 goto out;
2605
2606         retcode = drbd_request_state(adm_ctx.device, mask, val);
2607 out:
2608         drbd_adm_finish(info, retcode);
2609         return 0;
2610 }
2611
/*
 * Bitmap IO callback: run drbd_bmio_set_n_write(), then suspend the
 * activity log regardless of how that went.
 *
 * Returns the result of drbd_bmio_set_n_write().
 */
static int drbd_bmio_set_susp_al(struct drbd_device *device)
{
        const int write_result = drbd_bmio_set_n_write(device);

        drbd_suspend_al(device);

        return write_result;
}
2620
2621 int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
2622 {
2623         int retcode; /* drbd_ret_code, drbd_state_rv */
2624         struct drbd_device *device;
2625
2626         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2627         if (!adm_ctx.reply_skb)
2628                 return retcode;
2629         if (retcode != NO_ERROR)
2630                 goto out;
2631
2632         device = adm_ctx.device;
2633
2634         /* If there is still bitmap IO pending, probably because of a previous
2635          * resync just being finished, wait for it before requesting a new resync.
2636          * Also wait for it's after_state_ch(). */
2637         drbd_suspend_io(device);
2638         wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
2639         drbd_flush_workqueue(device);
2640
2641         /* If we happen to be C_STANDALONE R_PRIMARY, just set all bits
2642          * in the bitmap.  Otherwise, try to start a resync handshake
2643          * as sync source for full sync.
2644          */
2645         if (device->state.conn == C_STANDALONE && device->state.role == R_PRIMARY) {
2646                 /* The peer will get a resync upon connect anyways. Just make that
2647                    into a full resync. */
2648                 retcode = drbd_request_state(device, NS(pdsk, D_INCONSISTENT));
2649                 if (retcode >= SS_SUCCESS) {
2650                         if (drbd_bitmap_io(device, &drbd_bmio_set_susp_al,
2651                                 "set_n_write from invalidate_peer",
2652                                 BM_LOCKED_SET_ALLOWED))
2653                                 retcode = ERR_IO_MD_DISK;
2654                 }
2655         } else
2656                 retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_S));
2657         drbd_resume_io(device);
2658
2659 out:
2660         drbd_adm_finish(info, retcode);
2661         return 0;
2662 }
2663
2664 int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info)
2665 {
2666         enum drbd_ret_code retcode;
2667
2668         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2669         if (!adm_ctx.reply_skb)
2670                 return retcode;
2671         if (retcode != NO_ERROR)
2672                 goto out;
2673
2674         if (drbd_request_state(adm_ctx.device, NS(user_isp, 1)) == SS_NOTHING_TO_DO)
2675                 retcode = ERR_PAUSE_IS_SET;
2676 out:
2677         drbd_adm_finish(info, retcode);
2678         return 0;
2679 }
2680
2681 int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info)
2682 {
2683         union drbd_dev_state s;
2684         enum drbd_ret_code retcode;
2685
2686         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2687         if (!adm_ctx.reply_skb)
2688                 return retcode;
2689         if (retcode != NO_ERROR)
2690                 goto out;
2691
2692         if (drbd_request_state(adm_ctx.device, NS(user_isp, 0)) == SS_NOTHING_TO_DO) {
2693                 s = adm_ctx.device->state;
2694                 if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) {
2695                         retcode = s.aftr_isp ? ERR_PIC_AFTER_DEP :
2696                                   s.peer_isp ? ERR_PIC_PEER_DEP : ERR_PAUSE_IS_CLEAR;
2697                 } else {
2698                         retcode = ERR_PAUSE_IS_CLEAR;
2699                 }
2700         }
2701
2702 out:
2703         drbd_adm_finish(info, retcode);
2704         return 0;
2705 }
2706
/*
 * Admin request: ask for susp = 1 on the addressed device.
 *
 * Thin wrapper around drbd_adm_simple_request_state(); NS(susp, 1)
 * supplies both its mask and its val argument.  Returns what that
 * helper returns.
 */
int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info)
{
        return drbd_adm_simple_request_state(skb, info, NS(susp, 1));
}
2711
2712 int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info)
2713 {
2714         struct drbd_device *device;
2715         int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
2716
2717         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2718         if (!adm_ctx.reply_skb)
2719                 return retcode;
2720         if (retcode != NO_ERROR)
2721                 goto out;
2722
2723         device = adm_ctx.device;
2724         if (test_bit(NEW_CUR_UUID, &device->flags)) {
2725                 drbd_uuid_new_current(device);
2726                 clear_bit(NEW_CUR_UUID, &device->flags);
2727         }
2728         drbd_suspend_io(device);
2729         retcode = drbd_request_state(device, NS3(susp, 0, susp_nod, 0, susp_fen, 0));
2730         if (retcode == SS_SUCCESS) {
2731                 if (device->state.conn < C_CONNECTED)
2732                         tl_clear(first_peer_device(device)->connection);
2733                 if (device->state.disk == D_DISKLESS || device->state.disk == D_FAILED)
2734                         tl_restart(first_peer_device(device)->connection, FAIL_FROZEN_DISK_IO);
2735         }
2736         drbd_resume_io(device);
2737
2738 out:
2739         drbd_adm_finish(info, retcode);
2740         return 0;
2741 }
2742
/*
 * Admin request: ask for disk = D_OUTDATED on the addressed device.
 *
 * Thin wrapper around drbd_adm_simple_request_state(); NS(disk, D_OUTDATED)
 * supplies both its mask and its val argument.  Returns what that
 * helper returns.
 */
int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info)
{
        return drbd_adm_simple_request_state(skb, info, NS(disk, D_OUTDATED));
}
2747
2748 static int nla_put_drbd_cfg_context(struct sk_buff *skb, struct drbd_connection *connection, unsigned vnr)
2749 {
2750         struct nlattr *nla;
2751         nla = nla_nest_start(skb, DRBD_NLA_CFG_CONTEXT);
2752         if (!nla)
2753                 goto nla_put_failure;
2754         if (vnr != VOLUME_UNSPECIFIED &&
2755             nla_put_u32(skb, T_ctx_volume, vnr))
2756                 goto nla_put_failure;
2757         if (nla_put_string(skb, T_ctx_resource_name, connection->resource->name))
2758                 goto nla_put_failure;
2759         if (connection->my_addr_len &&
2760             nla_put(skb, T_ctx_my_addr, connection->my_addr_len, &connection->my_addr))
2761                 goto nla_put_failure;
2762         if (connection->peer_addr_len &&
2763             nla_put(skb, T_ctx_peer_addr, connection->peer_addr_len, &connection->peer_addr))
2764                 goto nla_put_failure;
2765         nla_nest_end(skb, nla);
2766         return 0;
2767
2768 nla_put_failure:
2769         if (nla)
2770                 nla_nest_cancel(skb, nla);
2771         return -EMSGSIZE;
2772 }
2773
/*
 * Append the status of one device to a netlink message.
 *
 * @skb:    message under construction
 * @device: device to report on
 * @sib:    details of a state info broadcast, or NULL for a status reply
 *
 * Emits, in this order: the configuration context, the resource options,
 * the disk configuration (only if a reference on the local disk could be
 * taken), the net configuration (only if the connection has one), and a
 * nested DRBD_NLA_STATE_INFO attribute holding state, counters, disk
 * related details and, for a broadcast, the fields specific to its reason.
 *
 * Returns 0 on success.  Every failure ends up at the nla_put_failure
 * label and is reported as -EMSGSIZE.  Attributes that were already added
 * (including a still open DRBD_NLA_STATE_INFO nest) are not removed here;
 * drbd_adm_get_status() frees the whole skb and get_one_status() cancels
 * the whole message when this function fails.
 */
static int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device,
                const struct sib_info *sib)
{
        struct state_info *si = NULL; /* for sizeof(si->member); */
        struct nlattr *nla;
        int got_ldev;
        int err = 0;
        int exclude_sensitive;

        /* If sib != NULL, this is drbd_bcast_event, which anyone can listen
         * to.  So we better exclude_sensitive information.
         *
         * If sib == NULL, this is drbd_adm_get_status, executed synchronously
         * in the context of the requesting user process. Exclude sensitive
         * information, unless current has superuser.
         *
         * NOTE: for drbd_adm_get_status_all(), this is a netlink dump, and
         * relies on the current implementation of netlink_dump(), which
         * executes the dump callback successively from netlink_recvmsg(),
         * always in the context of the receiving process */
        exclude_sensitive = sib || !capable(CAP_SYS_ADMIN);

        /* Remember whether we hold a local disk reference; it is dropped
         * again on the single exit path at the bottom. */
        got_ldev = get_ldev(device);

        /* We need to add connection name and volume number information still.
         * Minor number is in drbd_genlmsghdr. */
        if (nla_put_drbd_cfg_context(skb, first_peer_device(device)->connection, device->vnr))
                goto nla_put_failure;

        if (res_opts_to_skb(skb, &device->resource->res_opts, exclude_sensitive))
                goto nla_put_failure;

        /* disk_conf and net_conf are only dereferenced inside this RCU
         * read side section; the error is acted upon after leaving it. */
        rcu_read_lock();
        if (got_ldev) {
                struct disk_conf *disk_conf;

                disk_conf = rcu_dereference(device->ldev->disk_conf);
                err = disk_conf_to_skb(skb, disk_conf, exclude_sensitive);
        }
        if (!err) {
                struct net_conf *nc;

                nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
                if (nc)
                        err = net_conf_to_skb(skb, nc, exclude_sensitive);
        }
        rcu_read_unlock();
        if (err)
                goto nla_put_failure;

        nla = nla_nest_start(skb, DRBD_NLA_STATE_INFO);
        if (!nla)
                goto nla_put_failure;
        if (nla_put_u32(skb, T_sib_reason, sib ? sib->sib_reason : SIB_GET_STATUS_REPLY) ||
            nla_put_u32(skb, T_current_state, device->state.i) ||
            nla_put_u64(skb, T_ed_uuid, device->ed_uuid) ||
            nla_put_u64(skb, T_capacity, drbd_get_capacity(device->this_bdev)) ||
            nla_put_u64(skb, T_send_cnt, device->send_cnt) ||
            nla_put_u64(skb, T_recv_cnt, device->recv_cnt) ||
            nla_put_u64(skb, T_read_cnt, device->read_cnt) ||
            nla_put_u64(skb, T_writ_cnt, device->writ_cnt) ||
            nla_put_u64(skb, T_al_writ_cnt, device->al_writ_cnt) ||
            nla_put_u64(skb, T_bm_writ_cnt, device->bm_writ_cnt) ||
            nla_put_u32(skb, T_ap_bio_cnt, atomic_read(&device->ap_bio_cnt)) ||
            nla_put_u32(skb, T_ap_pending_cnt, atomic_read(&device->ap_pending_cnt)) ||
            nla_put_u32(skb, T_rs_pending_cnt, atomic_read(&device->rs_pending_cnt)))
                goto nla_put_failure;

        if (got_ldev) {
                /* Shadows the function level err; it only carries the
                 * nla_put() result out of the uuid_lock section. */
                int err;

                spin_lock_irq(&device->ldev->md.uuid_lock);
                err = nla_put(skb, T_uuids, sizeof(si->uuids), device->ldev->md.uuid);
                spin_unlock_irq(&device->ldev->md.uuid_lock);

                if (err)
                        goto nla_put_failure;

                if (nla_put_u32(skb, T_disk_flags, device->ldev->md.flags) ||
                    nla_put_u64(skb, T_bits_total, drbd_bm_bits(device)) ||
                    nla_put_u64(skb, T_bits_oos, drbd_bm_total_weight(device)))
                        goto nla_put_failure;
                /* Resync totals are only added while state.conn lies within
                 * C_SYNC_SOURCE .. C_PAUSED_SYNC_T. */
                if (C_SYNC_SOURCE <= device->state.conn &&
                    C_PAUSED_SYNC_T >= device->state.conn) {
                        if (nla_put_u64(skb, T_bits_rs_total, device->rs_total) ||
                            nla_put_u64(skb, T_bits_rs_failed, device->rs_failed))
                                goto nla_put_failure;
                }
        }

        if (sib) {
                switch(sib->sib_reason) {
                case SIB_SYNC_PROGRESS:
                case SIB_GET_STATUS_REPLY:
                        break;
                case SIB_STATE_CHANGE:
                        if (nla_put_u32(skb, T_prev_state, sib->os.i) ||
                            nla_put_u32(skb, T_new_state, sib->ns.i))
                                goto nla_put_failure;
                        break;
                case SIB_HELPER_POST:
                        if (nla_put_u32(skb, T_helper_exit_code,
                                        sib->helper_exit_code))
                                goto nla_put_failure;
                        /* fall through */
                case SIB_HELPER_PRE:
                        if (nla_put_string(skb, T_helper, sib->helper_name))
                                goto nla_put_failure;
                        break;
                }
        }
        nla_nest_end(skb, nla);

        /* The label sits inside an if (0) body: the regular flow skips the
         * assignment and keeps err == 0, while every goto nla_put_failure
         * lands on it.  Both ways continue with the common cleanup. */
        if (0)
nla_put_failure:
                err = -EMSGSIZE;
        if (got_ldev)
                put_ldev(device);
        return err;
}
2894
2895 int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info)
2896 {
2897         enum drbd_ret_code retcode;
2898         int err;
2899
2900         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2901         if (!adm_ctx.reply_skb)
2902                 return retcode;
2903         if (retcode != NO_ERROR)
2904                 goto out;
2905
2906         err = nla_put_status_info(adm_ctx.reply_skb, adm_ctx.device, NULL);
2907         if (err) {
2908                 nlmsg_free(adm_ctx.reply_skb);
2909                 return err;
2910         }
2911 out:
2912         drbd_adm_finish(info, retcode);
2913         return 0;
2914 }
2915
/*
 * Put at most one status message into @skb: either the status of one
 * volume, or the context (plus net configuration, if any) of a connection
 * that has no volumes at all.
 *
 * The iteration position is carried from call to call in @cb->args:
 *   args[0] - resource to continue with, stored as a pointer value;
 *             zero on the very first call
 *   args[1] - volume number to continue with
 *   args[2] - non-zero if only the volumes of a single resource are wanted
 *
 * Returns skb->len; if nothing was added this time, that leaves the skb
 * as it was handed in.
 */
static int get_one_status(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct drbd_device *device;
        struct drbd_genlmsghdr *dh;
        struct drbd_resource *pos = (struct drbd_resource *)cb->args[0];
        struct drbd_resource *resource = NULL;
        struct drbd_connection *connection;
        struct drbd_resource *tmp;
        unsigned volume = cb->args[1];

        /* Open coded, deferred, iteration:
         * for_each_resource_safe(resource, tmp, &drbd_resources) {
         *      connection = "first connection of resource";
         *      idr_for_each_entry(&connection->volumes, device, i) {
         *        ...
         *      }
         * }
         * where resource is cb->args[0];
         * and i is cb->args[1];
         *
         * cb->args[2] indicates if we shall loop over all resources,
         * or just dump all volumes of a single resource.
         *
         * This may miss entries inserted after this dump started,
         * or entries deleted before they are reached.
         *
         * We need to make sure the device won't disappear while
         * we are looking at it, and revalidate our iterators
         * on each iteration.
         */

        /* synchronize with conn_create()/drbd_destroy_connection() */
        rcu_read_lock();
        /* revalidate iterator position */
        /* If the remembered pos is no longer found on the list, resource
         * stays NULL and nothing is added to the skb below. */
        for_each_resource_rcu(tmp, &drbd_resources) {
                if (pos == NULL) {
                        /* first iteration */
                        pos = tmp;
                        resource = pos;
                        break;
                }
                if (tmp == pos) {
                        resource = pos;
                        break;
                }
        }
        if (resource) {
next_resource:
                connection = first_connection(resource);
                /* Looks up the next populated volume number at or after
                 * "volume", and updates "volume" to the one found. */
                device = idr_get_next(&connection->volumes, &volume);
                if (!device) {
                        /* No more volumes to dump on this resource.
                         * Advance resource iterator. */
                        pos = list_entry_rcu(resource->resources.next,
                                             struct drbd_resource, resources);
                        /* Did we dump any volume of this resource yet? */
                        if (volume != 0) {
                                /* If we reached the end of the list,
                                 * or only a single resource dump was requested,
                                 * we are done. */
                                if (&pos->resources == &drbd_resources || cb->args[2])
                                        goto out;
                                volume = 0;
                                resource = pos;
                                goto next_resource;
                        }
                        /* volume == 0: fall through and report the
                         * connection itself, see the !device case below. */
                }

                dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
                                cb->nlh->nlmsg_seq, &drbd_genl_family,
                                NLM_F_MULTI, DRBD_ADM_GET_STATUS);
                if (!dh)
                        goto out;

                if (!device) {
                        /* This is a connection without a single volume.
                         * Surprisingly enough, it may have a network
                         * configuration. */
                        struct net_conf *nc;
                        dh->minor = -1U;
                        dh->ret_code = NO_ERROR;
                        if (nla_put_drbd_cfg_context(skb, connection, VOLUME_UNSPECIFIED))
                                goto cancel;
                        nc = rcu_dereference(connection->net_conf);
                        if (nc && net_conf_to_skb(skb, nc, 1) != 0)
                                goto cancel;
                        goto done;
                }

                D_ASSERT(device->vnr == volume);
                D_ASSERT(first_peer_device(device)->connection == connection);

                dh->minor = device_to_minor(device);
                dh->ret_code = NO_ERROR;

                if (nla_put_status_info(skb, device, NULL)) {
                        /* Shared failure exit, also jumped to from the
                         * volume-less case above: take the partially built
                         * message out of the skb again. */
cancel:
                        genlmsg_cancel(skb, dh);
                        goto out;
                }
done:
                genlmsg_end(skb, dh);
        }

out:
        rcu_read_unlock();
        /* where to start the next iteration */
        /* Same resource again: continue after the volume just handled.
         * Iterator already advanced: start the next resource at volume 0. */
        cb->args[0] = (long)pos;
        cb->args[1] = (pos == resource) ? volume + 1 : 0;

        /* No more resources/volumes/minors found results in an empty skb.
         * Which will terminate the dump. */
        return skb->len;
}
3030
3031 /*
3032  * Request status of all resources, or of all volumes within a single resource.
3033  *
3034  * This is a dump, as the answer may not fit in a single reply skb otherwise.
3035  * Which means we cannot use the family->attrbuf or other such members, because
3036  * dump is NOT protected by the genl_lock().  During dump, we only have access
3037  * to the incoming skb, and need to opencode "parsing" of the nlattr payload.
3038  *
3039  * Once things are setup properly, we call into get_one_status().
3040  */
3041 int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb)
3042 {
3043         const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ;
3044         struct nlattr *nla;
3045         const char *resource_name;
3046         struct drbd_resource *resource;
3047         int maxtype;
3048
3049         /* Is this a followup call? */
3050         if (cb->args[0]) {
3051                 /* ... of a single resource dump,
3052                  * and the resource iterator has been advanced already? */
3053                 if (cb->args[2] && cb->args[2] != cb->args[0])
3054                         return 0; /* DONE. */
3055                 goto dump;
3056         }
3057
3058         /* First call (from netlink_dump_start).  We need to figure out
3059          * which resource(s) the user wants us to dump. */
3060         nla = nla_find(nlmsg_attrdata(cb->nlh, hdrlen),
3061                         nlmsg_attrlen(cb->nlh, hdrlen),
3062                         DRBD_NLA_CFG_CONTEXT);
3063
3064         /* No explicit context given.  Dump all. */
3065         if (!nla)
3066                 goto dump;
3067         maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1;
3068         nla = drbd_nla_find_nested(maxtype, nla, __nla_type(T_ctx_resource_name));
3069         if (IS_ERR(nla))
3070                 return PTR_ERR(nla);
3071         /* context given, but no name present? */
3072         if (!nla)
3073                 return -EINVAL;
3074         resource_name = nla_data(nla);
3075         if (!*resource_name)
3076                 return -ENODEV;
3077         resource = drbd_find_resource(resource_name);
3078         if (!resource)
3079                 return -ENODEV;
3080
3081         kref_put(&resource->kref, drbd_destroy_resource); /* get_one_status() revalidates the resource */
3082
3083         /* prime iterators, and set "filter" mode mark:
3084          * only dump this connection. */
3085         cb->args[0] = (long)resource;
3086         /* cb->args[1] = 0; passed in this way. */
3087         cb->args[2] = (long)resource;
3088
3089 dump:
3090         return get_one_status(skb, cb);
3091 }
3092
3093 int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info)
3094 {
3095         enum drbd_ret_code retcode;
3096         struct timeout_parms tp;
3097         int err;
3098
3099         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
3100         if (!adm_ctx.reply_skb)
3101                 return retcode;
3102         if (retcode != NO_ERROR)
3103                 goto out;
3104
3105         tp.timeout_type =
3106                 adm_ctx.device->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED :
3107                 test_bit(USE_DEGR_WFC_T, &adm_ctx.device->flags) ? UT_DEGRADED :
3108                 UT_DEFAULT;
3109
3110         err = timeout_parms_to_priv_skb(adm_ctx.reply_skb, &tp);
3111         if (err) {
3112                 nlmsg_free(adm_ctx.reply_skb);
3113                 return err;
3114         }
3115 out:
3116         drbd_adm_finish(info, retcode);
3117         return 0;
3118 }
3119
3120 int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info)
3121 {
3122         struct drbd_device *device;
3123         enum drbd_ret_code retcode;
3124         struct start_ov_parms parms;
3125
3126         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
3127         if (!adm_ctx.reply_skb)
3128                 return retcode;
3129         if (retcode != NO_ERROR)
3130                 goto out;
3131
3132         device = adm_ctx.device;
3133
3134         /* resume from last known position, if possible */
3135         parms.ov_start_sector = device->ov_start_sector;
3136         parms.ov_stop_sector = ULLONG_MAX;
3137         if (info->attrs[DRBD_NLA_START_OV_PARMS]) {
3138                 int err = start_ov_parms_from_attrs(&parms, info);
3139                 if (err) {
3140                         retcode = ERR_MANDATORY_TAG;
3141                         drbd_msg_put_info(from_attrs_err_to_txt(err));
3142                         goto out;
3143                 }
3144         }
3145         /* w_make_ov_request expects position to be aligned */
3146         device->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1);
3147         device->ov_stop_sector = parms.ov_stop_sector;
3148
3149         /* If there is still bitmap IO pending, e.g. previous resync or verify
3150          * just being finished, wait for it before requesting a new resync. */
3151         drbd_suspend_io(device);
3152         wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
3153         retcode = drbd_request_state(device, NS(conn, C_VERIFY_S));
3154         drbd_resume_io(device);
3155 out:
3156         drbd_adm_finish(info, retcode);
3157         return 0;
3158 }
3159
3160
3161 int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info)
3162 {
3163         struct drbd_device *device;
3164         enum drbd_ret_code retcode;
3165         int skip_initial_sync = 0;
3166         int err;
3167         struct new_c_uuid_parms args;
3168
3169         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
3170         if (!adm_ctx.reply_skb)
3171                 return retcode;
3172         if (retcode != NO_ERROR)
3173                 goto out_nolock;
3174
3175         device = adm_ctx.device;
3176         memset(&args, 0, sizeof(args));
3177         if (info->attrs[DRBD_NLA_NEW_C_UUID_PARMS]) {
3178                 err = new_c_uuid_parms_from_attrs(&args, info);
3179                 if (err) {
3180                         retcode = ERR_MANDATORY_TAG;
3181                         drbd_msg_put_info(from_attrs_err_to_txt(err));
3182                         goto out_nolock;
3183                 }
3184         }
3185
3186         mutex_lock(device->state_mutex); /* Protects us against serialized state changes. */
3187
3188         if (!get_ldev(device)) {
3189                 retcode = ERR_NO_DISK;
3190                 goto out;
3191         }
3192
3193         /* this is "skip initial sync", assume to be clean */
3194         if (device->state.conn == C_CONNECTED &&
3195             first_peer_device(device)->connection->agreed_pro_version >= 90 &&
3196             device->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) {
3197                 dev_info(DEV, "Preparing to skip initial sync\n");
3198                 skip_initial_sync = 1;
3199         } else if (device->state.conn != C_STANDALONE) {
3200                 retcode = ERR_CONNECTED;
3201                 goto out_dec;
3202         }
3203
3204         drbd_uuid_set(device, UI_BITMAP, 0); /* Rotate UI_BITMAP to History 1, etc... */
3205         drbd_uuid_new_current(device); /* New current, previous to UI_BITMAP */
3206
3207         if (args.clear_bm) {
3208                 err = drbd_bitmap_io(device, &drbd_bmio_clear_n_write,
3209                         "clear_n_write from new_c_uuid", BM_LOCKED_MASK);
3210                 if (err) {
3211                         dev_err(DEV, "Writing bitmap failed with %d\n",err);
3212                         retcode = ERR_IO_MD_DISK;
3213                 }
3214                 if (skip_initial_sync) {
3215                         drbd_send_uuids_skip_initial_sync(device);
3216                         _drbd_uuid_set(device, UI_BITMAP, 0);
3217                         drbd_print_uuids(device, "cleared bitmap UUID");
3218                         spin_lock_irq(&first_peer_device(device)->connection->req_lock);
3219                         _drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
3220                                         CS_VERBOSE, NULL);
3221                         spin_unlock_irq(&first_peer_device(device)->connection->req_lock);
3222                 }
3223         }
3224
3225         drbd_md_sync(device);
3226 out_dec:
3227         put_ldev(device);
3228 out:
3229         mutex_unlock(device->state_mutex);
3230 out_nolock:
3231         drbd_adm_finish(info, retcode);
3232         return 0;
3233 }
3234
3235 static enum drbd_ret_code
3236 drbd_check_resource_name(const char *name)
3237 {
3238         if (!name || !name[0]) {
3239                 drbd_msg_put_info("resource name missing");
3240                 return ERR_MANDATORY_TAG;
3241         }
3242         /* if we want to use these in sysfs/configfs/debugfs some day,
3243          * we must not allow slashes */
3244         if (strchr(name, '/')) {
3245                 drbd_msg_put_info("invalid resource name");
3246                 return ERR_INVALID_REQUEST;
3247         }
3248         return NO_ERROR;
3249 }
3250
3251 int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info)
3252 {
3253         enum drbd_ret_code retcode;
3254         struct res_opts res_opts;
3255         int err;
3256
3257         retcode = drbd_adm_prepare(skb, info, 0);
3258         if (!adm_ctx.reply_skb)
3259                 return retcode;
3260         if (retcode != NO_ERROR)
3261                 goto out;
3262
3263         set_res_opts_defaults(&res_opts);
3264         err = res_opts_from_attrs(&res_opts, info);
3265         if (err && err != -ENOMSG) {
3266                 retcode = ERR_MANDATORY_TAG;
3267                 drbd_msg_put_info(from_attrs_err_to_txt(err));
3268                 goto out;
3269         }
3270
3271         retcode = drbd_check_resource_name(adm_ctx.resource_name);
3272         if (retcode != NO_ERROR)
3273                 goto out;
3274
3275         if (adm_ctx.connection) {
3276                 if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) {
3277                         retcode = ERR_INVALID_REQUEST;
3278                         drbd_msg_put_info("resource exists");
3279                 }
3280                 /* else: still NO_ERROR */
3281                 goto out;
3282         }
3283
3284         if (!conn_create(adm_ctx.resource_name, &res_opts))
3285                 retcode = ERR_NOMEM;
3286 out:
3287         drbd_adm_finish(info, retcode);
3288         return 0;
3289 }
3290
3291 int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info)
3292 {
3293         struct drbd_genlmsghdr *dh = info->userhdr;
3294         enum drbd_ret_code retcode;
3295
3296         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
3297         if (!adm_ctx.reply_skb)
3298                 return retcode;
3299         if (retcode != NO_ERROR)
3300                 goto out;
3301
3302         if (dh->minor > MINORMASK) {
3303                 drbd_msg_put_info("requested minor out of range");
3304                 retcode = ERR_INVALID_REQUEST;
3305                 goto out;
3306         }
3307         if (adm_ctx.volume > DRBD_VOLUME_MAX) {
3308                 drbd_msg_put_info("requested volume id out of range");
3309                 retcode = ERR_INVALID_REQUEST;
3310                 goto out;
3311         }
3312
3313         /* drbd_adm_prepare made sure already
3314          * that first_peer_device(device)->connection and device->vnr match the request. */
3315         if (adm_ctx.device) {
3316                 if (info->nlhdr->nlmsg_flags & NLM_F_EXCL)
3317                         retcode = ERR_MINOR_EXISTS;
3318                 /* else: still NO_ERROR */
3319                 goto out;
3320         }
3321
3322         retcode = drbd_create_minor(adm_ctx.connection, dh->minor, adm_ctx.volume);
3323 out:
3324         drbd_adm_finish(info, retcode);
3325         return 0;
3326 }
3327
3328 static enum drbd_ret_code adm_del_minor(struct drbd_device *device)
3329 {
3330         if (device->state.disk == D_DISKLESS &&
3331             /* no need to be device->state.conn == C_STANDALONE &&
3332              * we may want to delete a minor from a live replication group.
3333              */
3334             device->state.role == R_SECONDARY) {
3335                 _drbd_request_state(device, NS(conn, C_WF_REPORT_PARAMS),
3336                                     CS_VERBOSE + CS_WAIT_COMPLETE);
3337                 drbd_delete_minor(device);
3338                 return NO_ERROR;
3339         } else
3340                 return ERR_MINOR_CONFIGURED;
3341 }
3342
3343 int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info)
3344 {
3345         enum drbd_ret_code retcode;
3346
3347         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
3348         if (!adm_ctx.reply_skb)
3349                 return retcode;
3350         if (retcode != NO_ERROR)
3351                 goto out;
3352
3353         retcode = adm_del_minor(adm_ctx.device);
3354 out:
3355         drbd_adm_finish(info, retcode);
3356         return 0;
3357 }
3358
3359 int drbd_adm_down(struct sk_buff *skb, struct genl_info *info)
3360 {
3361         int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
3362         struct drbd_device *device;
3363         unsigned i;
3364
3365         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
3366         if (!adm_ctx.reply_skb)
3367                 return retcode;
3368         if (retcode != NO_ERROR)
3369                 goto out;
3370
3371         /* demote */
3372         idr_for_each_entry(&adm_ctx.connection->volumes, device, i) {
3373                 retcode = drbd_set_role(device, R_SECONDARY, 0);
3374                 if (retcode < SS_SUCCESS) {
3375                         drbd_msg_put_info("failed to demote");
3376                         goto out;
3377                 }
3378         }
3379
3380         retcode = conn_try_disconnect(adm_ctx.connection, 0);
3381         if (retcode < SS_SUCCESS) {
3382                 drbd_msg_put_info("failed to disconnect");
3383                 goto out;
3384         }
3385
3386         /* detach */
3387         idr_for_each_entry(&adm_ctx.connection->volumes, device, i) {
3388                 retcode = adm_detach(device, 0);
3389                 if (retcode < SS_SUCCESS || retcode > NO_ERROR) {
3390                         drbd_msg_put_info("failed to detach");
3391                         goto out;
3392                 }
3393         }
3394
3395         /* If we reach this, all volumes (of this connection) are Secondary,
3396          * Disconnected, Diskless, aka Unconfigured. Make sure all threads have
3397          * actually stopped, state handling only does drbd_thread_stop_nowait(). */
3398         drbd_thread_stop(&adm_ctx.connection->worker);
3399
3400         /* Now, nothing can fail anymore */
3401
3402         /* delete volumes */
3403         idr_for_each_entry(&adm_ctx.connection->volumes, device, i) {
3404                 retcode = adm_del_minor(device);
3405                 if (retcode != NO_ERROR) {
3406                         /* "can not happen" */
3407                         drbd_msg_put_info("failed to delete volume");
3408                         goto out;
3409                 }
3410         }
3411
3412         /* delete connection */
3413         if (conn_lowest_minor(adm_ctx.connection) < 0) {
3414                 struct drbd_resource *resource = adm_ctx.connection->resource;
3415
3416                 list_del_rcu(&resource->resources);
3417                 synchronize_rcu();
3418                 drbd_free_resource(resource);
3419
3420                 retcode = NO_ERROR;
3421         } else {
3422                 /* "can not happen" */
3423                 retcode = ERR_RES_IN_USE;
3424                 drbd_msg_put_info("failed to delete connection");
3425         }
3426         goto out;
3427 out:
3428         drbd_adm_finish(info, retcode);
3429         return 0;
3430 }
3431
3432 int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info)
3433 {
3434         struct drbd_resource *resource;
3435         struct drbd_connection *connection;
3436         enum drbd_ret_code retcode;
3437
3438         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
3439         if (!adm_ctx.reply_skb)
3440                 return retcode;
3441         if (retcode != NO_ERROR)
3442                 goto out;
3443
3444         resource = adm_ctx.resource;
3445         for_each_connection(connection, resource) {
3446                 if (connection->cstate > C_STANDALONE) {
3447                         retcode = ERR_NET_CONFIGURED;
3448                         goto out;
3449                 }
3450         }
3451         if (!idr_is_empty(&resource->devices)) {
3452                 retcode = ERR_RES_IN_USE;
3453                 goto out;
3454         }
3455
3456         list_del_rcu(&resource->resources);
3457         for_each_connection(connection, resource)
3458                 drbd_thread_stop(&connection->worker);
3459         synchronize_rcu();
3460         drbd_free_resource(resource);
3461         retcode = NO_ERROR;
3462 out:
3463         drbd_adm_finish(info, retcode);
3464         return 0;
3465 }
3466
3467 void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib)
3468 {
3469         static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */
3470         struct sk_buff *msg;
3471         struct drbd_genlmsghdr *d_out;
3472         unsigned seq;
3473         int err = -ENOMEM;
3474
3475         if (sib->sib_reason == SIB_SYNC_PROGRESS) {
3476                 if (time_after(jiffies, device->rs_last_bcast + HZ))
3477                         device->rs_last_bcast = jiffies;
3478                 else
3479                         return;
3480         }
3481
3482         seq = atomic_inc_return(&drbd_genl_seq);
3483         msg = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
3484         if (!msg)
3485                 goto failed;
3486
3487         err = -EMSGSIZE;
3488         d_out = genlmsg_put(msg, 0, seq, &drbd_genl_family, 0, DRBD_EVENT);
3489         if (!d_out) /* cannot happen, but anyways. */
3490                 goto nla_put_failure;
3491         d_out->minor = device_to_minor(device);
3492         d_out->ret_code = NO_ERROR;
3493
3494         if (nla_put_status_info(msg, device, sib))
3495                 goto nla_put_failure;
3496         genlmsg_end(msg, d_out);
3497         err = drbd_genl_multicast_events(msg, 0);
3498         /* msg has been consumed or freed in netlink_broadcast() */
3499         if (err && err != -ESRCH)
3500                 goto failed;
3501
3502         return;
3503
3504 nla_put_failure:
3505         nlmsg_free(msg);
3506 failed:
3507         dev_err(DEV, "Error %d while broadcasting event. "
3508                         "Event seq:%u sib_reason:%u\n",
3509                         err, seq, sib->sib_reason);
3510 }