1 /* 2 * Copyright 2006-2008, Haiku, Inc. All Rights Reserved. 3 * Distributed under the terms of the MIT License. 4 * 5 * Authors: 6 * Andrew Galante, haiku.galante@gmail.com 7 * Axel Dörfler, axeld@pinc-software.de 8 * Hugo Santos, hugosantos@gmail.com 9 */ 10 11 12 #include "TCPEndpoint.h" 13 #include "EndpointManager.h" 14 15 #include <net_buffer.h> 16 #include <net_datalink.h> 17 #include <net_stat.h> 18 #include <NetBufferUtilities.h> 19 #include <NetUtilities.h> 20 21 #include <lock.h> 22 #include <util/AutoLock.h> 23 #include <util/khash.h> 24 #include <util/list.h> 25 26 #include <KernelExport.h> 27 #include <Select.h> 28 29 #include <netinet/in.h> 30 #include <netinet/ip.h> 31 #include <netinet/tcp.h> 32 #include <new> 33 #include <stdlib.h> 34 #include <string.h> 35 36 37 // References: 38 // - RFC 793 - Transmission Control Protocol 39 // - RFC 813 - Window and Acknowledgement Strategy in TCP 40 // 41 // Things this implementation currently doesn't implement: 42 // 43 // TCP Slow Start, Congestion Avoidance, Fast Retransmit, and Fast Recovery, RFC 2001, RFC 2581, RFC 3042 44 // NewReno Modification to TCP's Fast Recovery, RFC 2582 45 // Explicit Congestion Notification (ECN), RFC 3168 46 // SYN-Cache 47 // TCP Extensions for High Performance, RFC 1323 48 // SACK, Selective Acknowledgment - RFC 2018, RFC 2883, RFC 3517 49 // Forward RTO-Recovery, RFC 4138 50 51 #define PrintAddress(address) \ 52 AddressString(Domain(), address, true).Data() 53 54 //#define TRACE_TCP 55 //#define PROBE_TCP 56 57 #ifdef TRACE_TCP 58 // the space before ', ##args' is important in order for this to work with cpp 2.95 59 # define TRACE(format, args...) dprintf("TCP [%llu] %p (%12s) " format "\n", \ 60 system_time(), this, name_for_state(fState) , ##args) 61 #else 62 # define TRACE(args...) do { } while (0) 63 #endif 64 65 #ifdef PROBE_TCP 66 # define PROBE(buffer, window) \ 67 dprintf("TCP PROBE %llu %s %s %ld snxt %lu suna %lu cw %lu sst %lu win %lu swin %lu smax-suna %lu savail %lu sqused %lu rto %llu\n", \ 68 system_time(), PrintAddress(buffer->source), \ 69 PrintAddress(buffer->destination), buffer->size, (uint32)fSendNext, \ 70 (uint32)fSendUnacknowledged, fCongestionWindow, fSlowStartThreshold, \ 71 window, fSendWindow, (uint32)(fSendMax - fSendUnacknowledged), \ 72 fSendQueue.Available(fSendNext), fSendQueue.Used(), fRetransmitTimeout) 73 #else 74 # define PROBE(buffer, window) do { } while (0) 75 #endif 76 77 // Initial estimate for packet round trip time (RTT) 78 #define TCP_INITIAL_RTT 2000000 79 80 // constants for the fFlags field 81 enum { 82 FLAG_OPTION_WINDOW_SCALE = 0x01, 83 FLAG_OPTION_TIMESTAMP = 0x02, 84 // TODO: Should FLAG_NO_RECEIVE apply as well to received connections? 85 // That is, what is expected from accept() after a shutdown() 86 // is performed on a listen()ing socket. 87 FLAG_NO_RECEIVE = 0x04, 88 }; 89 90 91 static const int kTimestampFactor = 1024; 92 93 94 static inline bigtime_t 95 absolute_timeout(bigtime_t timeout) 96 { 97 if (timeout == 0 || timeout == B_INFINITE_TIMEOUT) 98 return timeout; 99 100 return timeout + system_time(); 101 } 102 103 104 static inline status_t 105 posix_error(status_t error) 106 { 107 if (error == B_TIMED_OUT) 108 return B_WOULD_BLOCK; 109 110 return error; 111 } 112 113 114 static inline bool 115 in_window(const tcp_sequence &sequence, const tcp_sequence &rcvNext, 116 uint32 rcvWindow) 117 { 118 return sequence >= rcvNext && sequence < (rcvNext + rcvWindow); 119 } 120 121 122 static inline bool 123 segment_in_sequence(const tcp_segment_header &segment, int size, 124 const tcp_sequence &rcvNext, uint32 rcvWindow) 125 { 126 tcp_sequence sequence(segment.sequence); 127 if (size == 0) { 128 if (rcvWindow == 0) 129 return sequence == rcvNext; 130 return in_window(sequence, rcvNext, rcvWindow); 131 } else { 132 if (rcvWindow == 0) 133 return false; 134 return in_window(sequence, rcvNext, rcvWindow) 135 || in_window(sequence + size - 1, rcvNext, rcvWindow); 136 } 137 } 138 139 140 static inline bool 141 is_writable(tcp_state state) 142 { 143 return state == SYNCHRONIZE_SENT || state == SYNCHRONIZE_RECEIVED 144 || state == ESTABLISHED || state == FINISH_RECEIVED; 145 } 146 147 148 static inline uint32 tcp_now() 149 { 150 return system_time() / kTimestampFactor; 151 } 152 153 154 static inline uint32 tcp_diff_timestamp(uint32 base) 155 { 156 uint32 now = tcp_now(); 157 158 if (now > base) 159 return now - base; 160 161 return now + UINT_MAX - base; 162 } 163 164 165 static inline bool 166 state_needs_finish(int32 state) 167 { 168 return state == WAIT_FOR_FINISH_ACKNOWLEDGE 169 || state == FINISH_SENT || state == CLOSING; 170 } 171 172 173 // #pragma mark - 174 175 176 WaitList::WaitList(const char *name) 177 { 178 fCondition = 0; 179 fSem = create_sem(0, name); 180 } 181 182 183 WaitList::~WaitList() 184 { 185 delete_sem(fSem); 186 } 187 188 189 status_t 190 WaitList::InitCheck() const 191 { 192 return fSem; 193 } 194 195 196 status_t 197 WaitList::Wait(MutexLocker &locker, bigtime_t timeout, bool wakeNext) 198 { 199 locker.Unlock(); 200 201 status_t status = B_OK; 202 203 while (status == B_OK && !atomic_test_and_set(&fCondition, 0, 1)) { 204 status = acquire_sem_etc(fSem, 1, B_ABSOLUTE_TIMEOUT | B_CAN_INTERRUPT, 205 timeout); 206 } 207 208 locker.Lock(); 209 if (status == B_OK && wakeNext) 210 Signal(); 211 212 return status; 213 } 214 215 216 void 217 WaitList::Signal() 218 { 219 atomic_or(&fCondition, 1); 220 #ifdef __HAIKU__ 221 release_sem_etc(fSem, 1, B_DO_NOT_RESCHEDULE | B_RELEASE_IF_WAITING_ONLY); 222 #else 223 int32 count; 224 if (get_sem_count(fSem, &count) == B_OK && count < 0) 225 release_sem_etc(fSem, 1, B_DO_NOT_RESCHEDULE); 226 #endif 227 } 228 229 230 // #pragma mark - 231 232 233 TCPEndpoint::TCPEndpoint(net_socket *socket) 234 : 235 ProtocolSocket(socket), 236 fManager(NULL), 237 fReceiveList("tcp receive"), 238 fSendList("tcp send"), 239 fOptions(0), 240 fSendWindowShift(0), 241 fReceiveWindowShift(0), 242 fSendUnacknowledged(0), 243 fSendNext(0), 244 fSendMax(0), 245 fSendWindow(0), 246 fSendMaxWindow(0), 247 fSendMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE), 248 fSendQueue(socket->send.buffer_size), 249 fInitialSendSequence(0), 250 fDuplicateAcknowledgeCount(0), 251 fRoute(NULL), 252 fReceiveNext(0), 253 fReceiveMaxAdvertised(0), 254 fReceiveWindow(socket->receive.buffer_size), 255 fReceiveMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE), 256 fReceiveQueue(socket->receive.buffer_size), 257 fRoundTripTime(TCP_INITIAL_RTT / kTimestampFactor), 258 fRoundTripDeviation(TCP_INITIAL_RTT / kTimestampFactor), 259 fRetransmitTimeout(TCP_INITIAL_RTT), 260 fReceivedTimestamp(0), 261 fCongestionWindow(0), 262 fSlowStartThreshold(0), 263 fState(CLOSED), 264 fFlags(FLAG_OPTION_WINDOW_SCALE | FLAG_OPTION_TIMESTAMP) 265 { 266 //gStackModule->init_timer(&fTimer, _TimeWait, this); 267 268 // TODO: to be replaced with a real locking strategy! 269 mutex_init(&fLock, "tcp lock"); 270 271 gStackModule->init_timer(&fPersistTimer, TCPEndpoint::_PersistTimer, this); 272 gStackModule->init_timer(&fRetransmitTimer, TCPEndpoint::_RetransmitTimer, this); 273 gStackModule->init_timer(&fDelayedAcknowledgeTimer, 274 TCPEndpoint::_DelayedAcknowledgeTimer, this); 275 gStackModule->init_timer(&fTimeWaitTimer, TCPEndpoint::_TimeWaitTimer, this); 276 } 277 278 279 TCPEndpoint::~TCPEndpoint() 280 { 281 mutex_lock(&fLock); 282 283 gStackModule->cancel_timer(&fRetransmitTimer); 284 gStackModule->cancel_timer(&fPersistTimer); 285 gStackModule->cancel_timer(&fDelayedAcknowledgeTimer); 286 gStackModule->cancel_timer(&fTimeWaitTimer); 287 288 if (fManager) { 289 fManager->Unbind(this); 290 return_endpoint_manager(fManager); 291 } 292 293 mutex_destroy(&fLock); 294 } 295 296 297 status_t 298 TCPEndpoint::InitCheck() const 299 { 300 if (fLock.sem < B_OK) 301 return fLock.sem; 302 303 if (fReceiveList.InitCheck() < B_OK) 304 return fReceiveList.InitCheck(); 305 306 if (fSendList.InitCheck() < B_OK) 307 return fSendList.InitCheck(); 308 309 return B_OK; 310 } 311 312 313 // #pragma mark - protocol API 314 315 316 status_t 317 TCPEndpoint::Open() 318 { 319 TRACE("Open()"); 320 321 status_t status = ProtocolSocket::Open(); 322 if (status < B_OK) 323 return status; 324 325 fManager = create_endpoint_manager(Domain()); 326 if (fManager == NULL) 327 return EAFNOSUPPORT; 328 329 return B_OK; 330 } 331 332 333 status_t 334 TCPEndpoint::Close() 335 { 336 TRACE("Close()"); 337 338 MutexLocker lock(fLock); 339 340 if (fState == LISTEN) 341 delete_sem(fAcceptSemaphore); 342 343 if (fState == SYNCHRONIZE_SENT || fState == LISTEN) { 344 fState = CLOSED; 345 return B_OK; 346 } 347 348 status_t status = _ShutdownEgress(true); 349 if (status != B_OK) 350 return status; 351 352 if (socket->options & SO_LINGER) { 353 TRACE("Close(): Lingering for %i secs", socket->linger); 354 355 bigtime_t maximum = absolute_timeout(socket->linger * 1000000LL); 356 357 while (fSendQueue.Used() > 0) { 358 status = fSendList.Wait(lock, maximum); 359 if (status == B_TIMED_OUT || status == B_WOULD_BLOCK) 360 break; 361 else if (status < B_OK) 362 return status; 363 } 364 365 TRACE("Close(): after waiting, the SendQ was left with %lu bytes.", 366 fSendQueue.Used()); 367 } 368 369 // TODO: do i need to wait until fState returns to CLOSED? 370 return B_OK; 371 } 372 373 374 status_t 375 TCPEndpoint::Free() 376 { 377 TRACE("Free()"); 378 379 MutexLocker _(fLock); 380 381 if (fState <= SYNCHRONIZE_SENT || fState == TIME_WAIT) 382 return B_OK; 383 384 // we are only interested in the timer, not in changing state 385 _EnterTimeWait(); 386 return B_BUSY; 387 // we'll be freed later when the 2MSL timer expires 388 } 389 390 391 /*! 392 Creates and sends a synchronize packet to /a address, and then waits 393 until the connection has been established or refused. 394 */ 395 status_t 396 TCPEndpoint::Connect(const sockaddr *address) 397 { 398 TRACE("Connect() on address %s", PrintAddress(address)); 399 400 MutexLocker locker(fLock); 401 402 if (gStackModule->is_restarted_syscall()) { 403 bigtime_t timeout = gStackModule->restore_syscall_restart_timeout(); 404 status_t status = _WaitForEstablished(locker, timeout); 405 TRACE(" Connect(): Connection complete: %s (timeout was %llu)", 406 strerror(status), timeout); 407 return posix_error(status); 408 } 409 410 // Can only call connect() from CLOSED or LISTEN states 411 // otherwise endpoint is considered already connected 412 if (fState == LISTEN) { 413 // this socket is about to connect; remove pending connections in the backlog 414 gSocketModule->set_max_backlog(socket, 0); 415 } else if (fState == ESTABLISHED) { 416 return EISCONN; 417 } else if (fState != CLOSED) 418 return EINPROGRESS; 419 420 status_t status = _PrepareSendPath(address); 421 if (status < B_OK) 422 return status; 423 424 TRACE(" Connect(): starting 3-way handshake..."); 425 426 fState = SYNCHRONIZE_SENT; 427 428 // send SYN 429 status = _SendQueued(); 430 if (status != B_OK) { 431 fState = CLOSED; 432 return status; 433 } 434 435 // If we are running over Loopback, after _SendQueued() returns we 436 // may be in ESTABLISHED already. 437 if (fState == ESTABLISHED) { 438 TRACE(" Connect() completed after _SendQueued()"); 439 return B_OK; 440 } 441 442 // wait until 3-way handshake is complete (if needed) 443 bigtime_t timeout = min_c(socket->send.timeout, TCP_CONNECTION_TIMEOUT); 444 if (timeout == 0) { 445 // we're a non-blocking socket 446 TRACE(" Connect() delayed, return EINPROGRESS"); 447 return EINPROGRESS; 448 } 449 450 bigtime_t absoluteTimeout = absolute_timeout(timeout); 451 gStackModule->store_syscall_restart_timeout(absoluteTimeout); 452 453 status = _WaitForEstablished(locker, absoluteTimeout); 454 TRACE(" Connect(): Connection complete: %s (timeout was %llu)", 455 strerror(status), timeout); 456 return posix_error(status); 457 } 458 459 460 status_t 461 TCPEndpoint::Accept(struct net_socket **_acceptedSocket) 462 { 463 TRACE("Accept()"); 464 465 MutexLocker locker(fLock); 466 467 status_t status; 468 bigtime_t timeout = absolute_timeout(socket->receive.timeout); 469 if (gStackModule->is_restarted_syscall()) 470 timeout = gStackModule->restore_syscall_restart_timeout(); 471 else 472 gStackModule->store_syscall_restart_timeout(timeout); 473 474 do { 475 locker.Unlock(); 476 477 status = acquire_sem_etc(fAcceptSemaphore, 1, B_ABSOLUTE_TIMEOUT 478 | B_CAN_INTERRUPT, timeout); 479 if (status < B_OK) 480 return status; 481 482 locker.Lock(); 483 status = gSocketModule->dequeue_connected(socket, _acceptedSocket); 484 #ifdef TRACE_TCP 485 if (status == B_OK) 486 TRACE(" Accept() returning %p", (*_acceptedSocket)->first_protocol); 487 #endif 488 } while (status < B_OK); 489 490 return status; 491 } 492 493 494 status_t 495 TCPEndpoint::Bind(const sockaddr *address) 496 { 497 if (address == NULL) 498 return B_BAD_VALUE; 499 500 MutexLocker lock(fLock); 501 502 TRACE("Bind() on address %s", PrintAddress(address)); 503 504 if (fState != CLOSED) 505 return EISCONN; 506 507 return fManager->Bind(this, address); 508 } 509 510 511 status_t 512 TCPEndpoint::Unbind(struct sockaddr *address) 513 { 514 TRACE("Unbind()"); 515 516 MutexLocker lock(fLock); 517 return fManager->Unbind(this); 518 } 519 520 521 status_t 522 TCPEndpoint::Listen(int count) 523 { 524 TRACE("Listen()"); 525 526 MutexLocker lock(fLock); 527 528 if (fState != CLOSED) 529 return B_BAD_VALUE; 530 531 fAcceptSemaphore = create_sem(0, "tcp accept"); 532 if (fAcceptSemaphore < B_OK) 533 return ENOBUFS; 534 535 status_t status = fManager->SetPassive(this); 536 if (status < B_OK) { 537 delete_sem(fAcceptSemaphore); 538 fAcceptSemaphore = -1; 539 return status; 540 } 541 542 fState = LISTEN; 543 return B_OK; 544 } 545 546 547 status_t 548 TCPEndpoint::Shutdown(int direction) 549 { 550 TRACE("Shutdown(%i)", direction); 551 552 MutexLocker lock(fLock); 553 554 if (direction == SHUT_RD || direction == SHUT_RDWR) 555 fFlags |= FLAG_NO_RECEIVE; 556 557 if (direction == SHUT_WR || direction == SHUT_RDWR) 558 _ShutdownEgress(false); 559 560 return B_OK; 561 } 562 563 564 /*! 565 Puts data contained in \a buffer into send buffer 566 */ 567 status_t 568 TCPEndpoint::SendData(net_buffer *buffer) 569 { 570 MutexLocker lock(fLock); 571 572 TRACE("SendData(buffer %p, size %lu, flags %lx) [total %lu bytes, has %lu]", 573 buffer, buffer->size, buffer->flags, fSendQueue.Size(), 574 fSendQueue.Free()); 575 576 if (fState == CLOSED) 577 return ENOTCONN; 578 if (fState == LISTEN) 579 return EDESTADDRREQ; 580 if (fState == FINISH_SENT || fState == FINISH_ACKNOWLEDGED 581 || fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE 582 || fState == TIME_WAIT) { 583 // TODO: send SIGPIPE signal to app? 584 return EPIPE; 585 } 586 587 if (buffer->size > 0) { 588 if (buffer->size > fSendQueue.Size()) 589 return ENOBUFS; 590 591 bigtime_t timeout = absolute_timeout(socket->send.timeout); 592 if (gStackModule->is_restarted_syscall()) 593 timeout = gStackModule->restore_syscall_restart_timeout(); 594 else 595 gStackModule->store_syscall_restart_timeout(timeout); 596 597 while (fSendQueue.Free() < buffer->size) { 598 status_t status = fSendList.Wait(lock, timeout); 599 if (status < B_OK) { 600 TRACE(" SendData() returning %s (%d)", 601 strerror(posix_error(status)), (int)posix_error(status)); 602 return posix_error(status); 603 } 604 } 605 606 fSendQueue.Add(buffer); 607 } 608 609 TRACE(" SendData(): %lu bytes used.", fSendQueue.Used()); 610 611 if (fState == ESTABLISHED || fState == FINISH_RECEIVED) 612 _SendQueued(); 613 614 return B_OK; 615 } 616 617 618 ssize_t 619 TCPEndpoint::SendAvailable() 620 { 621 MutexLocker locker(fLock); 622 623 ssize_t available; 624 625 if (is_writable(fState)) 626 available = fSendQueue.Free(); 627 else 628 available = EPIPE; 629 630 TRACE("SendAvailable(): %li", available); 631 return available; 632 } 633 634 635 status_t 636 TCPEndpoint::FillStat(net_stat *stat) 637 { 638 MutexLocker _(fLock); 639 640 strlcpy(stat->state, name_for_state(fState), sizeof(stat->state)); 641 stat->receive_queue_size = fReceiveQueue.Available(); 642 stat->send_queue_size = fSendQueue.Used(); 643 644 return B_OK; 645 } 646 647 648 status_t 649 TCPEndpoint::ReadData(size_t numBytes, uint32 flags, net_buffer** _buffer) 650 { 651 TRACE("ReadData(%lu bytes, flags 0x%x)", numBytes, (unsigned int)flags); 652 653 MutexLocker locker(fLock); 654 655 *_buffer = NULL; 656 657 if (fState == CLOSED) 658 return ENOTCONN; 659 660 bigtime_t timeout = absolute_timeout(socket->receive.timeout); 661 if (gStackModule->is_restarted_syscall()) 662 timeout = gStackModule->restore_syscall_restart_timeout(); 663 else 664 gStackModule->store_syscall_restart_timeout(timeout); 665 666 if (fState == SYNCHRONIZE_SENT || fState == SYNCHRONIZE_RECEIVED) { 667 if (flags & MSG_DONTWAIT) 668 return B_WOULD_BLOCK; 669 670 status_t status = _WaitForEstablished(locker, timeout); 671 if (status < B_OK) 672 return posix_error(status); 673 } 674 675 size_t dataNeeded = socket->receive.low_water_mark; 676 677 // When MSG_WAITALL is set then the function should block 678 // until the full amount of data can be returned. 679 if (flags & MSG_WAITALL) 680 dataNeeded = numBytes; 681 682 // TODO: add support for urgent data (MSG_OOB) 683 684 while (true) { 685 if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE 686 || fState == TIME_WAIT) { 687 // ``Connection closing''. 688 return B_OK; 689 } 690 691 if (fReceiveQueue.Available() > 0) { 692 if (fReceiveQueue.Available() >= dataNeeded 693 || (fReceiveQueue.PushedData() > 0 694 && fReceiveQueue.PushedData() >= fReceiveQueue.Available())) 695 break; 696 } else if (fState == FINISH_RECEIVED) { 697 // ``If no text is awaiting delivery, the RECEIVE will 698 // get a Connection closing''. 699 return B_OK; 700 } 701 702 if ((flags & MSG_DONTWAIT) != 0 || socket->receive.timeout == 0) 703 return B_WOULD_BLOCK; 704 705 status_t status = fReceiveList.Wait(locker, timeout, false); 706 if (status < B_OK) { 707 // The Open Group base specification mentions that EINTR should be 708 // returned if the recv() is interrupted before _any data_ is 709 // available. So we actually check if there is data, and if so, 710 // push it to the user. 711 if ((status == B_TIMED_OUT || status == B_INTERRUPTED) 712 && fReceiveQueue.Available() > 0) 713 break; 714 715 return posix_error(status); 716 } 717 } 718 719 TRACE(" ReadData(): %lu are available.", fReceiveQueue.Available()); 720 721 if (numBytes < fReceiveQueue.Available()) 722 fReceiveList.Signal(); 723 724 bool clone = (flags & MSG_PEEK); 725 726 ssize_t receivedBytes = fReceiveQueue.Get(numBytes, !clone, _buffer); 727 728 TRACE(" ReadData(): %lu bytes kept.", fReceiveQueue.Available()); 729 730 // if we are opening the window, check if we should send an ACK 731 if (!clone) 732 SendAcknowledge(false); 733 734 return receivedBytes; 735 } 736 737 738 ssize_t 739 TCPEndpoint::ReadAvailable() 740 { 741 MutexLocker locker(fLock); 742 743 TRACE("ReadAvailable(): %li", _AvailableData()); 744 745 return _AvailableData(); 746 } 747 748 749 status_t 750 TCPEndpoint::SetSendBufferSize(size_t length) 751 { 752 MutexLocker _(fLock); 753 fSendQueue.SetMaxBytes(length); 754 return B_OK; 755 } 756 757 758 status_t 759 TCPEndpoint::SetReceiveBufferSize(size_t length) 760 { 761 MutexLocker _(fLock); 762 fReceiveQueue.SetMaxBytes(length); 763 return B_OK; 764 } 765 766 767 status_t 768 TCPEndpoint::SetOption(int option, const void *_value, int length) 769 { 770 if (option != TCP_NODELAY) 771 return B_BAD_VALUE; 772 773 if (length != sizeof(int)) 774 return B_BAD_VALUE; 775 776 const int *value = (const int *)_value; 777 778 MutexLocker _(fLock); 779 if (*value) 780 fOptions |= TCP_NODELAY; 781 else 782 fOptions &= ~TCP_NODELAY; 783 784 return B_OK; 785 } 786 787 788 // #pragma mark - misc 789 790 791 bool 792 TCPEndpoint::IsBound() const 793 { 794 return !LocalAddress().IsEmpty(true); 795 } 796 797 798 status_t 799 TCPEndpoint::DelayedAcknowledge() 800 { 801 if (gStackModule->cancel_timer(&fDelayedAcknowledgeTimer)) { 802 // timer was active, send an ACK now (with the exception above, 803 // we send every other ACK) 804 return SendAcknowledge(true); 805 } 806 807 gStackModule->set_timer(&fDelayedAcknowledgeTimer, 808 TCP_DELAYED_ACKNOWLEDGE_TIMEOUT); 809 return B_OK; 810 } 811 812 813 status_t 814 TCPEndpoint::SendAcknowledge(bool force) 815 { 816 return _SendQueued(force, 0); 817 } 818 819 820 void 821 TCPEndpoint::_StartPersistTimer() 822 { 823 gStackModule->set_timer(&fPersistTimer, 1000000LL); 824 } 825 826 827 void 828 TCPEndpoint::_EnterTimeWait() 829 { 830 TRACE("_EnterTimeWait()\n"); 831 gStackModule->set_timer(&fTimeWaitTimer, TCP_MAX_SEGMENT_LIFETIME << 1); 832 } 833 834 835 status_t 836 TCPEndpoint::UpdateTimeWait() 837 { 838 return B_OK; 839 } 840 841 842 // #pragma mark - receive 843 844 845 int32 846 TCPEndpoint::_ListenReceive(tcp_segment_header &segment, net_buffer *buffer) 847 { 848 TRACE("ListenReceive()"); 849 850 // Essentially, we accept only TCP_FLAG_SYNCHRONIZE in this state, 851 // but the error behaviour differs 852 if (segment.flags & TCP_FLAG_RESET) 853 return DROP; 854 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) 855 return DROP | RESET; 856 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0) 857 return DROP; 858 859 // TODO: drop broadcast/multicast 860 861 // spawn new endpoint for accept() 862 net_socket *newSocket; 863 if (gSocketModule->spawn_pending_socket(socket, &newSocket) < B_OK) 864 return DROP; 865 866 return ((TCPEndpoint *)newSocket->first_protocol)->Spawn(this, 867 segment, buffer); 868 } 869 870 871 int32 872 TCPEndpoint::Spawn(TCPEndpoint *parent, tcp_segment_header &segment, 873 net_buffer *buffer) 874 { 875 MutexLocker _(fLock); 876 877 // TODO error checking 878 ProtocolSocket::Open(); 879 880 fState = SYNCHRONIZE_RECEIVED; 881 fManager = parent->fManager; 882 883 LocalAddress().SetTo(buffer->destination); 884 PeerAddress().SetTo(buffer->source); 885 886 TRACE("Spawn()"); 887 888 // TODO: proper error handling! 889 if (fManager->BindChild(this) < B_OK) 890 return DROP; 891 892 if (_PrepareSendPath(*PeerAddress()) < B_OK) 893 return DROP; 894 895 fOptions = parent->fOptions; 896 fAcceptSemaphore = parent->fAcceptSemaphore; 897 898 _PrepareReceivePath(segment); 899 900 // send SYN+ACK 901 if (_SendQueued() < B_OK) 902 return DROP; 903 904 segment.flags &= ~TCP_FLAG_SYNCHRONIZE; 905 // we handled this flag now, it must not be set for further processing 906 907 return _Receive(segment, buffer); 908 } 909 910 911 void 912 TCPEndpoint::DumpInternalState() const 913 { 914 kprintf("Lock: { sem: %ld, holder: %ld }\n", fLock.sem, fLock.holder); 915 kprintf("AcceptSem: %ld\n", fAcceptSemaphore); 916 kprintf("Options: 0x%lx\n", (uint32)fOptions); 917 kprintf("SendWindowShift: %lu\n", (uint32)fSendWindowShift); 918 kprintf("ReceiveWindowShift: %lu\n", (uint32)fReceiveWindowShift); 919 kprintf("SendUnacknowledged: %lu\n", (uint32)fSendUnacknowledged); 920 kprintf("SendNext: %lu\n", (uint32)fSendNext); 921 kprintf("SendMax: %lu\n", (uint32)fSendMax); 922 kprintf("SendWindow: %lu\n", fSendWindow); 923 kprintf("SendMaxWindow: %lu\n", fSendMaxWindow); 924 kprintf("SendMaxSegmentSize: %lu\n", fSendMaxSegmentSize); 925 kprintf("Send-Q: %lu / %lu\n", fSendQueue.Used(), fSendQueue.Size()); 926 kprintf("LastAcknowledgeSent: %lu\n", (uint32)fLastAcknowledgeSent); 927 kprintf("InitialSendSequence: %lu\n", (uint32)fInitialSendSequence); 928 kprintf("DuplicateAcknowledgeCount: %lu\n", fDuplicateAcknowledgeCount); 929 kprintf("ReceiveNext: %lu\n", (uint32)fReceiveNext); 930 kprintf("ReceiveMaxAdvertised: %lu\n", (uint32)fReceiveMaxAdvertised); 931 kprintf("ReceiveWindow: %lu\n", (uint32)fReceiveWindow); 932 kprintf("ReceiveMaxSegmentSize: %lu\n", (uint32)fReceiveMaxSegmentSize); 933 kprintf("Recv-Q: %lu / %lu\n", fReceiveQueue.Available(), 934 fReceiveQueue.Size()); 935 kprintf("InitialReceiveSequence: %lu\n", (uint32)fInitialReceiveSequence); 936 kprintf("RoundTripTime: %ld (dev %ld)\n", fRoundTripTime, 937 fRoundTripDeviation); 938 kprintf("RetransmitTimeout: %llu\n", (uint64)fRetransmitTimeout); 939 kprintf("CongestionWindow: %lu\n", fCongestionWindow); 940 kprintf("SlowStartThreshold: %lu\n", fSlowStartThreshold); 941 kprintf("State: %s\n", name_for_state(fState)); 942 kprintf("Flags: 0x%lx\n", fFlags); 943 } 944 945 946 void 947 TCPEndpoint::_HandleReset(status_t error) 948 { 949 gStackModule->cancel_timer(&fRetransmitTimer); 950 951 socket->error = error; 952 fState = CLOSED; 953 954 fSendList.Signal(); 955 _NotifyReader(); 956 957 gSocketModule->notify(socket, B_SELECT_WRITE, error); 958 gSocketModule->notify(socket, B_SELECT_ERROR, error); 959 } 960 961 962 int32 963 TCPEndpoint::_SynchronizeSentReceive(tcp_segment_header &segment, 964 net_buffer *buffer) 965 { 966 TRACE("_SynchronizeSentReceive()"); 967 968 if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0 969 && (fInitialSendSequence >= segment.acknowledge 970 || fSendMax < segment.acknowledge)) 971 return DROP | RESET; 972 973 if (segment.flags & TCP_FLAG_RESET) { 974 _HandleReset(ECONNREFUSED); 975 return DROP; 976 } 977 978 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0) 979 return DROP; 980 981 fSendUnacknowledged = segment.acknowledge; 982 _PrepareReceivePath(segment); 983 984 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) { 985 _MarkEstablished(); 986 } else { 987 // simultaneous open 988 fState = SYNCHRONIZE_RECEIVED; 989 } 990 991 segment.flags &= ~TCP_FLAG_SYNCHRONIZE; 992 // we handled this flag now, it must not be set for further processing 993 994 return _Receive(segment, buffer) | IMMEDIATE_ACKNOWLEDGE; 995 } 996 997 998 int32 999 TCPEndpoint::SegmentReceived(tcp_segment_header &segment, net_buffer *buffer) 1000 { 1001 MutexLocker locker(fLock); 1002 1003 TRACE("SegmentReceived(): buffer %p (%lu bytes) address %s to %s", 1004 buffer, buffer->size, PrintAddress(buffer->source), 1005 PrintAddress(buffer->destination)); 1006 TRACE(" flags 0x%x, seq %lu, ack %lu, wnd %lu", 1007 segment.flags, segment.sequence, segment.acknowledge, 1008 (uint32)segment.advertised_window << fSendWindowShift); 1009 1010 int32 segmentAction = DROP; 1011 1012 switch (fState) { 1013 case LISTEN: 1014 segmentAction = _ListenReceive(segment, buffer); 1015 break; 1016 1017 case SYNCHRONIZE_SENT: 1018 segmentAction = _SynchronizeSentReceive(segment, buffer); 1019 break; 1020 1021 case SYNCHRONIZE_RECEIVED: 1022 case ESTABLISHED: 1023 case FINISH_RECEIVED: 1024 case WAIT_FOR_FINISH_ACKNOWLEDGE: 1025 case FINISH_SENT: 1026 case FINISH_ACKNOWLEDGED: 1027 case CLOSING: 1028 case TIME_WAIT: 1029 case CLOSED: 1030 segmentAction = _SegmentReceived(segment, buffer); 1031 break; 1032 } 1033 1034 // process acknowledge action as asked for by the *Receive() method 1035 if (segmentAction & IMMEDIATE_ACKNOWLEDGE) 1036 SendAcknowledge(true); 1037 else if (segmentAction & ACKNOWLEDGE) 1038 DelayedAcknowledge(); 1039 1040 return segmentAction; 1041 } 1042 1043 int32 1044 TCPEndpoint::_SegmentReceived(tcp_segment_header &segment, net_buffer *buffer) 1045 { 1046 uint32 advertisedWindow = (uint32)segment.advertised_window 1047 << fSendWindowShift; 1048 1049 // First, handle the most common case for uni-directional data transfer 1050 // (known as header prediction - the segment must not change the window, 1051 // and must be the expected sequence, and contain no control flags) 1052 1053 if (fState == ESTABLISHED 1054 && segment.AcknowledgeOnly() 1055 && fReceiveNext == segment.sequence 1056 && advertisedWindow > 0 && advertisedWindow == fSendWindow 1057 && fSendNext == fSendMax) { 1058 _UpdateTimestamps(segment, buffer->size); 1059 1060 if (buffer->size == 0) { 1061 // this is a pure acknowledge segment - we're on the sending end 1062 if (fSendUnacknowledged < segment.acknowledge 1063 && fSendMax >= segment.acknowledge) { 1064 _Acknowledged(segment); 1065 return DROP; 1066 } 1067 } else if (segment.acknowledge == fSendUnacknowledged 1068 && fReceiveQueue.IsContiguous() 1069 && fReceiveQueue.Free() >= buffer->size 1070 && !(fFlags & FLAG_NO_RECEIVE)) { 1071 if (_AddData(segment, buffer)) 1072 _NotifyReader(); 1073 1074 return KEEP | ((segment.flags & TCP_FLAG_PUSH) != 0 1075 ? IMMEDIATE_ACKNOWLEDGE : ACKNOWLEDGE); 1076 } 1077 } 1078 1079 // The fast path was not applicable, so we continue with the standard 1080 // processing of the incoming segment 1081 1082 if (fState != SYNCHRONIZE_SENT && fState != LISTEN && fState != CLOSED) { 1083 // 1. check sequence number 1084 if (!segment_in_sequence(segment, buffer->size, fReceiveNext, 1085 fReceiveWindow)) { 1086 TRACE(" Receive(): segment out of window, next: %lu wnd: %lu", 1087 (uint32)fReceiveNext, fReceiveWindow); 1088 if (segment.flags & TCP_FLAG_RESET) { 1089 // TODO: this doesn't look right - review! 1090 return DROP; 1091 } 1092 return DROP | IMMEDIATE_ACKNOWLEDGE; 1093 } 1094 } 1095 1096 return _Receive(segment, buffer); 1097 } 1098 1099 1100 // #pragma mark - send 1101 1102 1103 inline uint8 1104 TCPEndpoint::_CurrentFlags() 1105 { 1106 // we don't set FLAG_FINISH here, instead we do it 1107 // conditionally below depending if we are sending 1108 // the last bytes of the send queue. 1109 1110 switch (fState) { 1111 case CLOSED: 1112 return TCP_FLAG_RESET | TCP_FLAG_ACKNOWLEDGE; 1113 1114 case SYNCHRONIZE_SENT: 1115 return TCP_FLAG_SYNCHRONIZE; 1116 case SYNCHRONIZE_RECEIVED: 1117 return TCP_FLAG_SYNCHRONIZE | TCP_FLAG_ACKNOWLEDGE; 1118 1119 case ESTABLISHED: 1120 case FINISH_RECEIVED: 1121 case FINISH_ACKNOWLEDGED: 1122 case TIME_WAIT: 1123 case WAIT_FOR_FINISH_ACKNOWLEDGE: 1124 case FINISH_SENT: 1125 case CLOSING: 1126 return TCP_FLAG_ACKNOWLEDGE; 1127 1128 default: 1129 return 0; 1130 } 1131 } 1132 1133 1134 inline bool 1135 TCPEndpoint::_ShouldSendSegment(tcp_segment_header &segment, uint32 length, 1136 uint32 segmentMaxSize, uint32 flightSize) 1137 { 1138 if (length > 0) { 1139 // Avoid the silly window syndrome - we only send a segment in case: 1140 // - we have a full segment to send, or 1141 // - we're at the end of our buffer queue, or 1142 // - the buffer is at least larger than half of the maximum send window, or 1143 // - we're retransmitting data 1144 if (length == segmentMaxSize 1145 || (fOptions & TCP_NODELAY) != 0 1146 || tcp_sequence(fSendNext + length) == fSendQueue.LastSequence() 1147 || (fSendMaxWindow > 0 && length >= fSendMaxWindow / 2)) 1148 return true; 1149 } 1150 1151 // check if we need to send a window update to the peer 1152 if (segment.advertised_window > 0) { 1153 // correct the window to take into account what already has been advertised 1154 uint32 window = (segment.advertised_window << fReceiveWindowShift) 1155 - (fReceiveMaxAdvertised - fReceiveNext); 1156 1157 // if we can advertise a window larger than twice the maximum segment 1158 // size, or half the maximum buffer size we send a window update 1159 if (window >= (fReceiveMaxSegmentSize << 1) 1160 || window >= (socket->receive.buffer_size >> 1)) 1161 return true; 1162 } 1163 1164 if ((segment.flags & (TCP_FLAG_SYNCHRONIZE | TCP_FLAG_FINISH 1165 | TCP_FLAG_RESET)) != 0) 1166 return true; 1167 1168 // there is no reason to send a segment just now 1169 return false; 1170 } 1171 1172 1173 status_t 1174 TCPEndpoint::_SendQueued(bool force) 1175 { 1176 return _SendQueued(force, fSendWindow); 1177 } 1178 1179 1180 /*! 1181 Sends one or more TCP segments with the data waiting in the queue, or some 1182 specific flags that need to be sent. 1183 */ 1184 status_t 1185 TCPEndpoint::_SendQueued(bool force, uint32 sendWindow) 1186 { 1187 if (fRoute == NULL) 1188 return B_ERROR; 1189 1190 // in passive state? 1191 if (fState == LISTEN) 1192 return B_ERROR; 1193 1194 tcp_segment_header segment(_CurrentFlags()); 1195 1196 if ((fOptions & TCP_NOOPT) == 0) { 1197 if ((fFlags & FLAG_OPTION_TIMESTAMP) != 0) { 1198 segment.options |= TCP_HAS_TIMESTAMPS; 1199 segment.timestamp_reply = fReceivedTimestamp; 1200 segment.timestamp_value = tcp_now(); 1201 } 1202 1203 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0 1204 && fSendNext == fInitialSendSequence) { 1205 // add connection establishment options 1206 segment.max_segment_size = fReceiveMaxSegmentSize; 1207 if (fFlags & FLAG_OPTION_WINDOW_SCALE) { 1208 segment.options |= TCP_HAS_WINDOW_SCALE; 1209 segment.window_shift = fReceiveWindowShift; 1210 } 1211 } 1212 } 1213 1214 size_t availableBytes = fReceiveQueue.Free(); 1215 if (fFlags & FLAG_OPTION_WINDOW_SCALE) 1216 segment.advertised_window = availableBytes >> fReceiveWindowShift; 1217 else 1218 segment.advertised_window = min_c(TCP_MAX_WINDOW, availableBytes); 1219 1220 segment.acknowledge = fReceiveNext; 1221 segment.urgent_offset = 0; 1222 1223 if (fCongestionWindow > 0 && fCongestionWindow < sendWindow) 1224 sendWindow = fCongestionWindow; 1225 1226 // SND.UNA SND.NXT SND.MAX 1227 // | | | 1228 // v v v 1229 // ----------------------------------- 1230 // | effective window | 1231 // ----------------------------------- 1232 1233 // Flight size represents the window of data which is currently in the 1234 // ether. We should never send data such as the flight size becomes larger 1235 // than the effective window. Note however that the effective window may be 1236 // reduced (by congestion for instance), so at some point in time flight 1237 // size may be larger than the currently calculated window. 1238 1239 uint32 flightSize = fSendMax - fSendUnacknowledged; 1240 uint32 consumedWindow = fSendNext - fSendUnacknowledged; 1241 1242 if (consumedWindow > sendWindow) { 1243 sendWindow = 0; 1244 // TODO enter persist state? try to get a window update. 1245 } else 1246 sendWindow -= consumedWindow; 1247 1248 if (force && sendWindow == 0 && fSendNext <= fSendQueue.LastSequence()) { 1249 // send one byte of data to ask for a window update 1250 // (triggered by the persist timer) 1251 sendWindow = 1; 1252 } 1253 1254 uint32 length = min_c(fSendQueue.Available(fSendNext), sendWindow); 1255 tcp_sequence previousSendNext = fSendNext; 1256 1257 do { 1258 uint32 segmentMaxSize = fSendMaxSegmentSize 1259 - tcp_options_length(segment); 1260 uint32 segmentLength = min_c(length, segmentMaxSize); 1261 1262 if (fSendNext + segmentLength == fSendQueue.LastSequence()) { 1263 if (state_needs_finish(fState)) 1264 segment.flags |= TCP_FLAG_FINISH; 1265 if (length > 0) 1266 segment.flags |= TCP_FLAG_PUSH; 1267 } 1268 1269 // Determine if we should really send this segment 1270 if (!force && !_ShouldSendSegment(segment, segmentLength, 1271 segmentMaxSize, flightSize)) { 1272 if (fSendQueue.Available() 1273 && !gStackModule->is_timer_active(&fPersistTimer) 1274 && !gStackModule->is_timer_active(&fRetransmitTimer)) 1275 _StartPersistTimer(); 1276 break; 1277 } 1278 1279 net_buffer *buffer = gBufferModule->create(256); 1280 if (buffer == NULL) 1281 return B_NO_MEMORY; 1282 1283 status_t status = B_OK; 1284 if (segmentLength > 0) 1285 status = fSendQueue.Get(buffer, fSendNext, segmentLength); 1286 if (status < B_OK) { 1287 gBufferModule->free(buffer); 1288 return status; 1289 } 1290 1291 LocalAddress().CopyTo(buffer->source); 1292 PeerAddress().CopyTo(buffer->destination); 1293 1294 uint32 size = buffer->size; 1295 segment.sequence = fSendNext; 1296 1297 TRACE("SendQueued(): buffer %p (%lu bytes) address %s to %s", 1298 buffer, buffer->size, PrintAddress(buffer->source), 1299 PrintAddress(buffer->destination)); 1300 TRACE(" flags 0x%x, seq %lu, ack %lu, rwnd %hu, cwnd %lu" 1301 ", ssthresh %lu", segment.flags, segment.sequence, 1302 segment.acknowledge, segment.advertised_window, 1303 fCongestionWindow, fSlowStartThreshold); 1304 TRACE(" len %lu first %lu last %lu", segmentLength, 1305 (uint32)fSendQueue.FirstSequence(), 1306 (uint32)fSendQueue.LastSequence()); 1307 1308 PROBE(buffer, sendWindow); 1309 sendWindow -= buffer->size; 1310 1311 status = add_tcp_header(AddressModule(), segment, buffer); 1312 if (status != B_OK) { 1313 gBufferModule->free(buffer); 1314 return status; 1315 } 1316 1317 // Update send status - we need to do this before we send the data 1318 // for local connections as the answer is directly handled 1319 1320 if (segment.flags & TCP_FLAG_SYNCHRONIZE) { 1321 segment.options &= ~TCP_HAS_WINDOW_SCALE; 1322 segment.max_segment_size = 0; 1323 size++; 1324 } 1325 1326 if (segment.flags & TCP_FLAG_FINISH) 1327 size++; 1328 1329 uint32 sendMax = fSendMax; 1330 fSendNext += size; 1331 if (fSendMax < fSendNext) 1332 fSendMax = fSendNext; 1333 1334 fReceiveMaxAdvertised = fReceiveNext 1335 + ((uint32)segment.advertised_window << fReceiveWindowShift); 1336 1337 status = next->module->send_routed_data(next, fRoute, buffer); 1338 if (status < B_OK) { 1339 gBufferModule->free(buffer); 1340 1341 fSendNext = segment.sequence; 1342 fSendMax = sendMax; 1343 // restore send status 1344 return status; 1345 } 1346 1347 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) 1348 fLastAcknowledgeSent = segment.acknowledge; 1349 1350 length -= segmentLength; 1351 segment.flags &= ~(TCP_FLAG_SYNCHRONIZE | TCP_FLAG_RESET 1352 | TCP_FLAG_FINISH); 1353 } while (length > 0); 1354 1355 // if we sent data from the beggining of the send queue, 1356 // start the retransmition timer 1357 if (previousSendNext == fSendUnacknowledged 1358 && fSendNext > previousSendNext) { 1359 TRACE(" SendQueue(): set retransmit timer with rto %llu", 1360 fRetransmitTimeout); 1361 1362 gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout); 1363 } 1364 1365 return B_OK; 1366 } 1367 1368 1369 int 1370 TCPEndpoint::_MaxSegmentSize(const sockaddr *address) const 1371 { 1372 return next->module->get_mtu(next, address) - sizeof(tcp_header); 1373 } 1374 1375 1376 status_t 1377 TCPEndpoint::_ShutdownEgress(bool closing) 1378 { 1379 tcp_state previousState = fState; 1380 1381 if (fState == SYNCHRONIZE_RECEIVED || fState == ESTABLISHED) 1382 fState = FINISH_SENT; 1383 else if (fState == FINISH_RECEIVED) 1384 fState = WAIT_FOR_FINISH_ACKNOWLEDGE; 1385 else 1386 return B_OK; 1387 1388 status_t status = _SendQueued(); 1389 if (status != B_OK) { 1390 fState = previousState; 1391 return status; 1392 } 1393 1394 return B_OK; 1395 } 1396 1397 1398 ssize_t 1399 TCPEndpoint::_AvailableData() const 1400 { 1401 // TODO: Refer to the FLAG_NO_RECEIVE comment above regarding 1402 // the application of FLAG_NO_RECEIVE in listen()ing 1403 // sockets. 1404 if (fState == LISTEN) 1405 return gSocketModule->count_connected(socket); 1406 if (fState == SYNCHRONIZE_SENT) 1407 return 0; 1408 1409 ssize_t availableData = fReceiveQueue.Available(); 1410 1411 if (availableData == 0 && !_ShouldReceive()) 1412 return ENOTCONN; 1413 1414 return availableData; 1415 } 1416 1417 1418 void 1419 TCPEndpoint::_NotifyReader() 1420 { 1421 fReceiveList.Signal(); 1422 gSocketModule->notify(socket, B_SELECT_READ, _AvailableData()); 1423 } 1424 1425 1426 bool 1427 TCPEndpoint::_ShouldReceive() const 1428 { 1429 if (fFlags & FLAG_NO_RECEIVE) 1430 return false; 1431 1432 return fState == ESTABLISHED || fState == FINISH_SENT 1433 || fState == FINISH_ACKNOWLEDGED; 1434 } 1435 1436 1437 int32 1438 TCPEndpoint::_Receive(tcp_segment_header &segment, net_buffer *buffer) 1439 { 1440 uint32 advertisedWindow = (uint32)segment.advertised_window 1441 << fSendWindowShift; 1442 1443 size_t segmentLength = buffer->size; 1444 1445 if (segment.flags & TCP_FLAG_RESET) { 1446 // is this a valid reset? 1447 if (fLastAcknowledgeSent <= segment.sequence 1448 && tcp_sequence(segment.sequence) < (fLastAcknowledgeSent 1449 + fReceiveWindow)) { 1450 status_t error; 1451 if (fState == SYNCHRONIZE_RECEIVED) 1452 error = ECONNREFUSED; 1453 else if (fState == CLOSING || fState == TIME_WAIT 1454 || fState == WAIT_FOR_FINISH_ACKNOWLEDGE) 1455 error = ENOTCONN; 1456 else 1457 error = ECONNRESET; 1458 1459 _HandleReset(error); 1460 } 1461 1462 return DROP; 1463 } 1464 1465 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0 1466 || (fState == SYNCHRONIZE_RECEIVED 1467 && (fInitialReceiveSequence > segment.sequence 1468 || (segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0 1469 && (fSendUnacknowledged > segment.acknowledge 1470 || fSendMax < segment.acknowledge)))) { 1471 // reset the connection - either the initial SYN was faulty, or we 1472 // received a SYN within the data stream 1473 return DROP | RESET; 1474 } 1475 1476 fReceiveWindow = max_c(fReceiveQueue.Free(), fReceiveWindow); 1477 // the window must not shrink 1478 1479 // trim buffer to be within the receive window 1480 int32 drop = fReceiveNext - segment.sequence; 1481 if (drop > 0) { 1482 if ((uint32)drop > buffer->size 1483 || ((uint32)drop == buffer->size 1484 && (segment.flags & TCP_FLAG_FINISH) == 0)) { 1485 // don't accidently remove a FIN we shouldn't remove 1486 segment.flags &= ~TCP_FLAG_FINISH; 1487 drop = buffer->size; 1488 } 1489 1490 // remove duplicate data at the start 1491 TRACE("* remove %ld bytes from the start", drop); 1492 gBufferModule->remove_header(buffer, drop); 1493 segment.sequence += drop; 1494 } 1495 1496 int32 action = KEEP; 1497 1498 drop = segment.sequence + buffer->size - (fReceiveNext + fReceiveWindow); 1499 if (drop > 0) { 1500 // remove data exceeding our window 1501 if ((uint32)drop >= buffer->size) { 1502 // if we can accept data, or the segment is not what we'd expect, 1503 // drop the segment (an immediate acknowledge is always triggered) 1504 if (fReceiveWindow != 0 || segment.sequence != fReceiveNext) 1505 return DROP | IMMEDIATE_ACKNOWLEDGE; 1506 1507 action |= IMMEDIATE_ACKNOWLEDGE; 1508 } 1509 1510 if ((segment.flags & TCP_FLAG_FINISH) != 0) { 1511 // we need to remove the finish, too, as part of the data 1512 drop--; 1513 } 1514 1515 segment.flags &= ~(TCP_FLAG_FINISH | TCP_FLAG_PUSH); 1516 TRACE("* remove %ld bytes from the end", drop); 1517 gBufferModule->remove_trailer(buffer, drop); 1518 } 1519 1520 #ifdef TRACE_TCP 1521 if (advertisedWindow > fSendWindow) { 1522 TRACE(" Receive(): Window update %lu -> %lu", fSendWindow, 1523 advertisedWindow); 1524 } 1525 #endif 1526 1527 fSendWindow = advertisedWindow; 1528 if (advertisedWindow > fSendMaxWindow) 1529 fSendMaxWindow = advertisedWindow; 1530 1531 // Then look at the acknowledgement for any updates 1532 1533 if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0) { 1534 // process acknowledged data 1535 if (fState == SYNCHRONIZE_RECEIVED) 1536 _MarkEstablished(); 1537 1538 if (fSendMax < segment.acknowledge || fState == TIME_WAIT) 1539 return DROP | IMMEDIATE_ACKNOWLEDGE; 1540 1541 if (segment.acknowledge < fSendUnacknowledged) { 1542 if (buffer->size == 0 && advertisedWindow == fSendWindow 1543 && (segment.flags & TCP_FLAG_FINISH) == 0) { 1544 TRACE("Receive(): duplicate ack!"); 1545 1546 _DuplicateAcknowledge(segment); 1547 } 1548 1549 return DROP; 1550 } else { 1551 // this segment acknowledges in flight data 1552 1553 if (fDuplicateAcknowledgeCount >= 3) { 1554 // deflate the window. 1555 fCongestionWindow = fSlowStartThreshold; 1556 } 1557 1558 fDuplicateAcknowledgeCount = 0; 1559 1560 if (fSendMax == segment.acknowledge) 1561 TRACE("Receive(): all inflight data ack'd!"); 1562 1563 if (segment.acknowledge > fSendQueue.LastSequence() 1564 && fState > ESTABLISHED) { 1565 TRACE("Receive(): FIN has been acknowledged!"); 1566 1567 switch (fState) { 1568 case FINISH_SENT: 1569 fState = FINISH_ACKNOWLEDGED; 1570 break; 1571 case CLOSING: 1572 fState = TIME_WAIT; 1573 _EnterTimeWait(); 1574 return DROP; 1575 case WAIT_FOR_FINISH_ACKNOWLEDGE: 1576 fState = CLOSED; 1577 break; 1578 1579 default: 1580 break; 1581 } 1582 } 1583 1584 if (fState != CLOSED) 1585 _Acknowledged(segment); 1586 } 1587 } 1588 1589 if (segment.flags & TCP_FLAG_URGENT) { 1590 if (fState == ESTABLISHED || fState == FINISH_SENT 1591 || fState == FINISH_ACKNOWLEDGED) { 1592 // TODO: Handle urgent data: 1593 // - RCV.UP <- max(RCV.UP, SEG.UP) 1594 // - signal the user that urgent data is available (SIGURG) 1595 } 1596 } 1597 1598 bool notify = false; 1599 1600 if (buffer->size > 0 && _ShouldReceive()) 1601 notify = _AddData(segment, buffer); 1602 else 1603 action = (action & ~KEEP) | DROP; 1604 1605 if (segment.flags & TCP_FLAG_FINISH) { 1606 segmentLength++; 1607 if (fState != CLOSED && fState != LISTEN && fState != SYNCHRONIZE_SENT) { 1608 TRACE("Receive(): peer is finishing connection!"); 1609 fReceiveNext++; 1610 notify = true; 1611 1612 // FIN implies PSH 1613 fReceiveQueue.SetPushPointer(); 1614 1615 // we'll reply immediatly to the FIN if we are not 1616 // transitioning to TIME WAIT so we immediatly ACK it. 1617 action |= IMMEDIATE_ACKNOWLEDGE; 1618 1619 // other side is closing connection; change states 1620 switch (fState) { 1621 case ESTABLISHED: 1622 case SYNCHRONIZE_RECEIVED: 1623 fState = FINISH_RECEIVED; 1624 break; 1625 case FINISH_SENT: 1626 // simultaneous close 1627 fState = CLOSING; 1628 break; 1629 case FINISH_ACKNOWLEDGED: 1630 fState = TIME_WAIT; 1631 _EnterTimeWait(); 1632 break; 1633 default: 1634 break; 1635 } 1636 } 1637 } 1638 1639 if (notify) 1640 _NotifyReader(); 1641 1642 if (buffer->size > 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0) 1643 action |= ACKNOWLEDGE; 1644 1645 _UpdateTimestamps(segment, segmentLength); 1646 1647 TRACE("Receive() Action %ld", action); 1648 1649 return action; 1650 } 1651 1652 1653 void 1654 TCPEndpoint::_UpdateTimestamps(tcp_segment_header &segment, 1655 size_t segmentLength) 1656 { 1657 if (fFlags & FLAG_OPTION_TIMESTAMP) { 1658 tcp_sequence sequence(segment.sequence); 1659 1660 if (fLastAcknowledgeSent >= sequence 1661 && fLastAcknowledgeSent < (sequence + segmentLength)) 1662 fReceivedTimestamp = segment.timestamp_value; 1663 } 1664 } 1665 1666 1667 void 1668 TCPEndpoint::_MarkEstablished() 1669 { 1670 fState = ESTABLISHED; 1671 1672 if (socket->parent != NULL) { 1673 gSocketModule->set_connected(socket); 1674 release_sem_etc(fAcceptSemaphore, 1, B_DO_NOT_RESCHEDULE); 1675 } 1676 1677 fSendList.Signal(); 1678 } 1679 1680 1681 status_t 1682 TCPEndpoint::_WaitForEstablished(MutexLocker &locker, bigtime_t timeout) 1683 { 1684 while (fState != ESTABLISHED) { 1685 if (socket->error != B_OK) 1686 return socket->error; 1687 1688 status_t status = fSendList.Wait(locker, timeout); 1689 if (status < B_OK) 1690 return status; 1691 } 1692 1693 return B_OK; 1694 } 1695 1696 1697 bool 1698 TCPEndpoint::_AddData(tcp_segment_header &segment, net_buffer *buffer) 1699 { 1700 fReceiveQueue.Add(buffer, segment.sequence); 1701 fReceiveNext = fReceiveQueue.NextSequence(); 1702 1703 TRACE(" _AddData(): adding data, receive next = %lu. Now have %lu bytes.", 1704 (uint32)fReceiveNext, fReceiveQueue.Available()); 1705 1706 if (segment.flags & TCP_FLAG_PUSH) 1707 fReceiveQueue.SetPushPointer(); 1708 1709 return fReceiveQueue.Available() > 0; 1710 } 1711 1712 1713 void 1714 TCPEndpoint::_PrepareReceivePath(tcp_segment_header &segment) 1715 { 1716 fInitialReceiveSequence = segment.sequence; 1717 1718 // count the received SYN 1719 segment.sequence++; 1720 1721 fReceiveNext = segment.sequence; 1722 fReceiveQueue.SetInitialSequence(segment.sequence); 1723 1724 if ((fOptions & TCP_NOOPT) == 0) { 1725 if (segment.max_segment_size > 0) 1726 fSendMaxSegmentSize = segment.max_segment_size; 1727 1728 if (segment.options & TCP_HAS_WINDOW_SCALE) { 1729 fFlags |= FLAG_OPTION_WINDOW_SCALE; 1730 fSendWindowShift = segment.window_shift; 1731 } else { 1732 fFlags &= ~FLAG_OPTION_WINDOW_SCALE; 1733 fReceiveWindowShift = 0; 1734 } 1735 1736 if (segment.options & TCP_HAS_TIMESTAMPS) { 1737 fFlags |= FLAG_OPTION_TIMESTAMP; 1738 fReceivedTimestamp = segment.timestamp_value; 1739 } else 1740 fFlags &= ~FLAG_OPTION_TIMESTAMP; 1741 } 1742 1743 fCongestionWindow = 2 * fSendMaxSegmentSize; 1744 fSlowStartThreshold = (uint32)segment.advertised_window << fSendWindowShift; 1745 } 1746 1747 1748 status_t 1749 TCPEndpoint::_PrepareSendPath(const sockaddr *peer) 1750 { 1751 if (fRoute == NULL) { 1752 fRoute = gDatalinkModule->get_route(Domain(), peer); 1753 if (fRoute == NULL) 1754 return ENETUNREACH; 1755 } 1756 1757 // make sure connection does not already exist 1758 status_t status = fManager->SetConnection(this, *LocalAddress(), peer, 1759 fRoute->interface->address); 1760 if (status < B_OK) 1761 return status; 1762 1763 fInitialSendSequence = system_time() >> 4; 1764 fSendNext = fInitialSendSequence; 1765 fSendUnacknowledged = fInitialSendSequence; 1766 fSendMax = fInitialSendSequence; 1767 1768 // we are counting the SYN here 1769 fSendQueue.SetInitialSequence(fSendNext + 1); 1770 1771 fReceiveMaxSegmentSize = _MaxSegmentSize(peer); 1772 1773 // Compute the window shift we advertise to our peer - if it doesn't support 1774 // this option, this will be reset to 0 (when its SYN is received) 1775 fReceiveWindowShift = 0; 1776 while (fReceiveWindowShift < TCP_MAX_WINDOW_SHIFT 1777 && (0xffffUL << fReceiveWindowShift) < socket->receive.buffer_size) { 1778 fReceiveWindowShift++; 1779 } 1780 1781 return B_OK; 1782 } 1783 1784 1785 void 1786 TCPEndpoint::_Acknowledged(tcp_segment_header &segment) 1787 { 1788 size_t previouslyUsed = fSendQueue.Used(); 1789 1790 fSendQueue.RemoveUntil(segment.acknowledge); 1791 fSendUnacknowledged = segment.acknowledge; 1792 1793 if (fSendNext < fSendUnacknowledged) 1794 fSendNext = fSendUnacknowledged; 1795 1796 if (fSendUnacknowledged == fSendMax) 1797 gStackModule->cancel_timer(&fRetransmitTimer); 1798 1799 if (fSendQueue.Used() < previouslyUsed) { 1800 // this ACK acknowledged data 1801 1802 if (segment.options & TCP_HAS_TIMESTAMPS) 1803 _UpdateSRTT(tcp_diff_timestamp(segment.timestamp_reply)); 1804 else { 1805 // TODO: Fallback to RFC 793 type estimation 1806 } 1807 1808 if (is_writable(fState)) { 1809 // notify threads waiting on the socket to become writable again 1810 fSendList.Signal(); 1811 gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Used()); 1812 } 1813 1814 if (fCongestionWindow < fSlowStartThreshold) 1815 fCongestionWindow += fSendMaxSegmentSize; 1816 } 1817 1818 if (fCongestionWindow >= fSlowStartThreshold) { 1819 uint32 increment = fSendMaxSegmentSize * fSendMaxSegmentSize; 1820 1821 if (increment < fCongestionWindow) 1822 increment = 1; 1823 else 1824 increment /= fCongestionWindow; 1825 1826 fCongestionWindow += increment; 1827 } 1828 1829 // if there is data left to be send, send it now 1830 if (fSendQueue.Used() > 0) 1831 _SendQueued(); 1832 } 1833 1834 1835 void 1836 TCPEndpoint::_Retransmit() 1837 { 1838 TRACE("Retransmit()"); 1839 _ResetSlowStart(); 1840 fSendNext = fSendUnacknowledged; 1841 _SendQueued(); 1842 } 1843 1844 1845 void 1846 TCPEndpoint::_UpdateSRTT(int32 roundTripTime) 1847 { 1848 int32 rtt = roundTripTime; 1849 1850 // Update_SRTT() as per Van Jacobson 1851 rtt -= (fRoundTripTime / 8); 1852 fRoundTripTime += rtt; 1853 if (rtt < 0) 1854 rtt = -rtt; 1855 rtt -= (fRoundTripDeviation / 4); 1856 fRoundTripDeviation += rtt; 1857 1858 fRetransmitTimeout = ((fRoundTripTime / 4 + fRoundTripDeviation) / 2) 1859 * kTimestampFactor; 1860 1861 TRACE(" RTO is now %llu (after rtt %ldms)", fRetransmitTimeout, 1862 roundTripTime); 1863 } 1864 1865 1866 void 1867 TCPEndpoint::_ResetSlowStart() 1868 { 1869 fSlowStartThreshold = max_c((fSendMax - fSendUnacknowledged) / 2, 1870 2 * fSendMaxSegmentSize); 1871 fCongestionWindow = fSendMaxSegmentSize; 1872 } 1873 1874 1875 void 1876 TCPEndpoint::_DuplicateAcknowledge(tcp_segment_header &segment) 1877 { 1878 if (++fDuplicateAcknowledgeCount < 3) 1879 return; 1880 1881 if (fDuplicateAcknowledgeCount == 3) { 1882 _ResetSlowStart(); 1883 fCongestionWindow = fSlowStartThreshold + 3 1884 * fSendMaxSegmentSize; 1885 fSendNext = segment.acknowledge; 1886 } else if (fDuplicateAcknowledgeCount > 3) 1887 fCongestionWindow += fSendMaxSegmentSize; 1888 1889 _SendQueued(); 1890 } 1891 1892 1893 // #pragma mark - timer 1894 1895 1896 /*static*/ void 1897 TCPEndpoint::_RetransmitTimer(net_timer *timer, void *data) 1898 { 1899 TCPEndpoint *endpoint = (TCPEndpoint *)data; 1900 1901 MutexLocker locker(endpoint->fLock); 1902 if (!locker.IsLocked()) 1903 return; 1904 1905 endpoint->_Retransmit(); 1906 } 1907 1908 1909 /*static*/ void 1910 TCPEndpoint::_PersistTimer(net_timer *timer, void *data) 1911 { 1912 TCPEndpoint *endpoint = (TCPEndpoint *)data; 1913 1914 MutexLocker locker(endpoint->fLock); 1915 if (!locker.IsLocked()) 1916 return; 1917 1918 endpoint->_SendQueued(true); 1919 } 1920 1921 1922 /*static*/ void 1923 TCPEndpoint::_DelayedAcknowledgeTimer(struct net_timer *timer, void *data) 1924 { 1925 TCPEndpoint *endpoint = (TCPEndpoint *)data; 1926 1927 MutexLocker locker(endpoint->fLock); 1928 if (!locker.IsLocked()) 1929 return; 1930 1931 endpoint->SendAcknowledge(true); 1932 } 1933 1934 1935 /*static*/ void 1936 TCPEndpoint::_TimeWaitTimer(struct net_timer *timer, void *data) 1937 { 1938 TCPEndpoint *endpoint = (TCPEndpoint *)data; 1939 1940 gSocketModule->delete_socket(endpoint->socket); 1941 } 1942 1943